# HG changeset patch # User Augie Fackler # Date 1524079928 14400 # Node ID ed5448edcbfa747b9154099e18630e49024fd47b # Parent fb92df8b634c1ff7593140a6eefc8a6c902136cd# Parent 92213f6745ed6f2c50feca9a2261b6f33a9a32fa merge with default to begin 4.6 freeze # no-check-commit because of many vendored packages diff -r fb92df8b634c -r ed5448edcbfa .clang-format --- a/.clang-format Wed Apr 04 10:35:09 2018 -0400 +++ b/.clang-format Wed Apr 18 15:32:08 2018 -0400 @@ -6,3 +6,8 @@ IndentCaseLabels: false AllowShortBlocksOnASingleLine: false AllowShortFunctionsOnASingleLine: false +IncludeCategories: + - Regex: '^<' + Priority: 1 + - Regex: '^"' + Priority: 2 diff -r fb92df8b634c -r ed5448edcbfa Makefile --- a/Makefile Wed Apr 04 10:35:09 2018 -0400 +++ b/Makefile Wed Apr 18 15:32:08 2018 -0400 @@ -124,7 +124,7 @@ format-c: clang-format --style file -i \ - `hg files 'set:(**.c or **.cc or **.h) and not "listfile:contrib/clang-format-blacklist"'` + `hg files 'set:(**.c or **.cc or **.h) and not "listfile:contrib/clang-format-ignorelist"'` update-pot: i18n/hg.pot @@ -132,8 +132,9 @@ $(PYTHON) i18n/hggettext mercurial/commands.py \ hgext/*.py hgext/*/__init__.py \ mercurial/fileset.py mercurial/revset.py \ - mercurial/templatefilters.py mercurial/templatekw.py \ - mercurial/templater.py \ + mercurial/templatefilters.py \ + mercurial/templatefuncs.py \ + mercurial/templatekw.py \ mercurial/filemerge.py \ mercurial/hgweb/webcommands.py \ mercurial/util.py \ @@ -212,11 +213,9 @@ sed "s/__CODENAME__/$*/" $< > $@ docker-debian-jessie: contrib/docker/debian-jessie - mkdir -p packages/debian-jessie contrib/dockerdeb debian jessie docker-debian-stretch: contrib/docker/debian-stretch - mkdir -p packages/debian-stretch contrib/dockerdeb debian stretch contrib/docker/ubuntu-%: contrib/docker/ubuntu.template @@ -234,24 +233,18 @@ docker-ubuntu-xenial-ppa: contrib/docker/ubuntu-xenial contrib/dockerdeb ubuntu xenial --source-only -docker-ubuntu-yakkety: contrib/docker/ubuntu-yakkety - 
contrib/dockerdeb ubuntu yakkety - -docker-ubuntu-yakkety-ppa: contrib/docker/ubuntu-yakkety - contrib/dockerdeb ubuntu yakkety --source-only - -docker-ubuntu-zesty: contrib/docker/ubuntu-zesty - contrib/dockerdeb ubuntu zesty - -docker-ubuntu-zesty-ppa: contrib/docker/ubuntu-zesty - contrib/dockerdeb ubuntu zesty --source-only - docker-ubuntu-artful: contrib/docker/ubuntu-artful contrib/dockerdeb ubuntu artful docker-ubuntu-artful-ppa: contrib/docker/ubuntu-artful contrib/dockerdeb ubuntu artful --source-only +docker-ubuntu-bionic: contrib/docker/ubuntu-bionic + contrib/dockerdeb ubuntu bionic + +docker-ubuntu-bionic-ppa: contrib/docker/ubuntu-bionic + contrib/dockerdeb ubuntu bionic --source-only + fedora20: mkdir -p packages/fedora20 contrib/buildrpm @@ -315,12 +308,16 @@ .PHONY: help all local build doc cleanbutpackages clean install install-bin \ install-doc install-home install-home-bin install-home-doc \ dist dist-notests check tests check-code format-c update-pot \ - osx deb ppa docker-debian-jessie docker-debian-stretch \ + osx deb ppa \ + docker-debian-jessie \ + docker-debian-stretch \ docker-ubuntu-trusty docker-ubuntu-trusty-ppa \ docker-ubuntu-xenial docker-ubuntu-xenial-ppa \ - docker-ubuntu-yakkety docker-ubuntu-yakkety-ppa \ - docker-ubuntu-zesty docker-ubuntu-zesty-ppa \ docker-ubuntu-artful docker-ubuntu-artful-ppa \ - fedora20 docker-fedora20 fedora21 docker-fedora21 \ - centos5 docker-centos5 centos6 docker-centos6 centos7 docker-centos7 \ + docker-ubuntu-bionic docker-ubuntu-bionic-ppa \ + fedora20 docker-fedora20 \ + fedora21 docker-fedora21 \ + centos5 docker-centos5 \ + centos6 docker-centos6 \ + centos7 docker-centos7 \ linux-wheels diff -r fb92df8b634c -r ed5448edcbfa contrib/Makefile.python --- a/contrib/Makefile.python Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/Makefile.python Wed Apr 18 15:32:08 2018 -0400 @@ -1,4 +1,4 @@ -PYTHONVER=2.7.10 +PYTHONVER=2.7.14 PYTHONNAME=python- PREFIX=$(HOME)/bin/prefix-$(PYTHONNAME)$(PYTHONVER) 
SYMLINKDIR=$(HOME)/bin diff -r fb92df8b634c -r ed5448edcbfa contrib/buildrpm --- a/contrib/buildrpm Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/buildrpm Wed Apr 18 15:32:08 2018 -0400 @@ -20,8 +20,8 @@ ;; --withpython | --with-python) shift - PYTHONVER=2.7.10 - PYTHONMD5=d7547558fd673bd9d38e2108c6b42521 + PYTHONVER=2.7.14 + PYTHONMD5=cee2e4b33ad3750da77b2e85f2f8b724 ;; --rpmbuilddir ) shift diff -r fb92df8b634c -r ed5448edcbfa contrib/check-code.py --- a/contrib/check-code.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/check-code.py Wed Apr 18 15:32:08 2018 -0400 @@ -111,7 +111,7 @@ (r'head -c', "don't use 'head -c', use 'dd'"), (r'tail -n', "don't use the '-n' option to tail, just use '-'"), (r'sha1sum', "don't use sha1sum, use $TESTDIR/md5sum.py"), - (r'ls.*-\w*R', "don't use 'ls -R', use 'find'"), + (r'\bls\b.*-\w*R', "don't use 'ls -R', use 'find'"), (r'printf.*[^\\]\\([1-9]|0\d)', r"don't use 'printf \NNN', use Python"), (r'printf.*[^\\]\\x', "don't use printf \\x, use Python"), (r'\$\(.*\)', "don't use $(expr), use `expr`"), @@ -150,6 +150,7 @@ (r'grep.* -[ABC]', "don't use grep's context flags"), (r'find.*-printf', "don't use 'find -printf', it doesn't exist on BSD find(1)"), + (r'\$RANDOM ', "don't use bash-only $RANDOM to generate random values"), ], # warnings [ @@ -318,9 +319,9 @@ "use util.readfile() instead"), (r'[\s\(](open|file)\([^)]*\)\.write\(', "use util.writefile() instead"), - (r'^[\s\(]*(open(er)?|file)\([^)]*\)', + (r'^[\s\(]*(open(er)?|file)\([^)]*\)(?!\.close\(\))', "always assign an opened file to a variable, and close it afterwards"), - (r'[\s\(](open|file)\([^)]*\)\.', + (r'[\s\(](open|file)\([^)]*\)\.(?!close\(\))', "always assign an opened file to a variable, and close it afterwards"), (r'(?i)descend[e]nt', "the proper spelling is descendAnt"), (r'\.debug\(\_', "don't mark debug messages for translation"), @@ -541,8 +542,11 @@ for i, pseq in enumerate(pats): # fix-up regexes for multi-line searches p = pseq[0] - # \s doesn't match \n 
- p = re.sub(r'(?|int|bool|list)\( # First argument. @@ -25,7 +25,7 @@ (?:default=)?(?P\S+?))? \)''', re.VERBOSE | re.MULTILINE) -configwithre = re.compile(''' +configwithre = re.compile(b''' ui\.config(?Pwith)\( # First argument is callback function. This doesn't parse robustly # if it is e.g. a function call. @@ -35,57 +35,57 @@ (?:default=)?(?P\S+?))? \)''', re.VERBOSE | re.MULTILINE) -configpartialre = (r"""ui\.config""") +configpartialre = (br"""ui\.config""") -ignorere = re.compile(r''' +ignorere = re.compile(br''' \#\s(?Pinternal|experimental|deprecated|developer|inconsistent)\s config:\s(?P\S+\.\S+)$ ''', re.VERBOSE | re.MULTILINE) def main(args): for f in args: - sect = '' - prevname = '' - confsect = '' - carryover = '' + sect = b'' + prevname = b'' + confsect = b'' + carryover = b'' linenum = 0 - for l in open(f): + for l in open(f, 'rb'): linenum += 1 # check topic-like bits - m = re.match('\s*``(\S+)``', l) + m = re.match(b'\s*``(\S+)``', l) if m: prevname = m.group(1) - if re.match('^\s*-+$', l): + if re.match(b'^\s*-+$', l): sect = prevname - prevname = '' + prevname = b'' if sect and prevname: - name = sect + '.' + prevname + name = sect + b'.' + prevname documented[name] = 1 # check docstring bits - m = re.match(r'^\s+\[(\S+)\]', l) + m = re.match(br'^\s+\[(\S+)\]', l) if m: confsect = m.group(1) continue - m = re.match(r'^\s+(?:#\s*)?(\S+) = ', l) + m = re.match(br'^\s+(?:#\s*)?(\S+) = ', l) if m: - name = confsect + '.' + m.group(1) + name = confsect + b'.' 
+ m.group(1) documented[name] = 1 # like the bugzilla extension - m = re.match(r'^\s*(\S+\.\S+)$', l) + m = re.match(br'^\s*(\S+\.\S+)$', l) if m: documented[m.group(1)] = 1 # like convert - m = re.match(r'^\s*:(\S+\.\S+):\s+', l) + m = re.match(br'^\s*:(\S+\.\S+):\s+', l) if m: documented[m.group(1)] = 1 # quoted in help or docstrings - m = re.match(r'.*?``(\S+\.\S+)``', l) + m = re.match(br'.*?``(\S+\.\S+)``', l) if m: documented[m.group(1)] = 1 @@ -108,7 +108,7 @@ default = m.group('default') if default in (None, 'False', 'None', '0', '[]', '""', "''"): default = '' - if re.match('[a-z.]+$', default): + if re.match(b'[a-z.]+$', default): default = '' if (name in foundopts and (ctype, default) != foundopts[name] and name not in allowinconsistent): diff -r fb92df8b634c -r ed5448edcbfa contrib/chg/chg.c --- a/contrib/chg/chg.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/chg/chg.c Wed Apr 18 15:32:08 2018 -0400 @@ -38,11 +38,13 @@ const char **args; }; -static void initcmdserveropts(struct cmdserveropts *opts) { +static void initcmdserveropts(struct cmdserveropts *opts) +{ memset(opts, 0, sizeof(struct cmdserveropts)); } -static void freecmdserveropts(struct cmdserveropts *opts) { +static void freecmdserveropts(struct cmdserveropts *opts) +{ free(opts->args); opts->args = NULL; opts->argsize = 0; @@ -59,12 +61,8 @@ const char *name; size_t narg; } flags[] = { - {"--config", 1}, - {"--cwd", 1}, - {"--repo", 1}, - {"--repository", 1}, - {"--traceback", 0}, - {"-R", 1}, + {"--config", 1}, {"--cwd", 1}, {"--repo", 1}, + {"--repository", 1}, {"--traceback", 0}, {"-R", 1}, }; size_t i; for (i = 0; i < sizeof(flags) / sizeof(flags[0]); ++i) { @@ -89,21 +87,21 @@ /* * Parse argv[] and put sensitive flags to opts->args */ -static void setcmdserverargs(struct cmdserveropts *opts, - int argc, const char *argv[]) +static void setcmdserverargs(struct cmdserveropts *opts, int argc, + const char *argv[]) { size_t i, step; opts->argsize = 0; for (i = 0, step = 1; i < (size_t)argc; 
i += step, step = 1) { if (!argv[i]) - continue; /* pass clang-analyse */ + continue; /* pass clang-analyse */ if (strcmp(argv[i], "--") == 0) break; size_t n = testsensitiveflag(argv[i]); if (n == 0 || i + n > (size_t)argc) continue; - opts->args = reallocx(opts->args, - (n + opts->argsize) * sizeof(char *)); + opts->args = + reallocx(opts->args, (n + opts->argsize) * sizeof(char *)); memcpy(opts->args + opts->argsize, argv + i, sizeof(char *) * n); opts->argsize += n; @@ -180,8 +178,8 @@ r = snprintf(opts->sockname, sizeof(opts->sockname), sockfmt, basename); if (r < 0 || (size_t)r >= sizeof(opts->sockname)) abortmsg("too long TMPDIR or CHGSOCKNAME (r = %d)", r); - r = snprintf(opts->initsockname, sizeof(opts->initsockname), - "%s.%u", opts->sockname, (unsigned)getpid()); + r = snprintf(opts->initsockname, sizeof(opts->initsockname), "%s.%u", + opts->sockname, (unsigned)getpid()); if (r < 0 || (size_t)r >= sizeof(opts->initsockname)) abortmsg("too long TMPDIR or CHGSOCKNAME (r = %d)", r); } @@ -208,11 +206,14 @@ const char *hgcmd = gethgcmd(); const char *baseargv[] = { - hgcmd, - "serve", - "--cmdserver", "chgunix", - "--address", opts->initsockname, - "--daemon-postexec", "chdir:/", + hgcmd, + "serve", + "--cmdserver", + "chgunix", + "--address", + opts->initsockname, + "--daemon-postexec", + "chdir:/", }; size_t baseargvsize = sizeof(baseargv) / sizeof(baseargv[0]); size_t argsize = baseargvsize + opts->argsize + 1; @@ -237,7 +238,7 @@ debugmsg("try connect to %s repeatedly", opts->initsockname); - unsigned int timeoutsec = 60; /* default: 60 seconds */ + unsigned int timeoutsec = 60; /* default: 60 seconds */ const char *timeoutenv = getenv("CHGTIMEOUT"); if (timeoutenv) sscanf(timeoutenv, "%u", &timeoutsec); @@ -246,7 +247,7 @@ hgclient_t *hgc = hgc_open(opts->initsockname); if (hgc) { debugmsg("rename %s to %s", opts->initsockname, - opts->sockname); + opts->sockname); int r = rename(opts->initsockname, opts->sockname); if (r != 0) abortmsgerrno("cannot 
rename"); @@ -270,7 +271,7 @@ if (WIFEXITED(pst)) { if (WEXITSTATUS(pst) == 0) abortmsg("could not connect to cmdserver " - "(exited with status 0)"); + "(exited with status 0)"); debugmsg("cmdserver exited with status %d", WEXITSTATUS(pst)); exit(WEXITSTATUS(pst)); } else if (WIFSIGNALED(pst)) { @@ -284,8 +285,8 @@ /* Connect to a cmdserver. Will start a new server on demand. */ static hgclient_t *connectcmdserver(struct cmdserveropts *opts) { - const char *sockname = opts->redirectsockname[0] ? - opts->redirectsockname : opts->sockname; + const char *sockname = + opts->redirectsockname[0] ? opts->redirectsockname : opts->sockname; debugmsg("try connect to %s", sockname); hgclient_t *hgc = hgc_open(sockname); if (hgc) @@ -339,8 +340,8 @@ unlink(*pinst + 7); } else if (strncmp(*pinst, "redirect ", 9) == 0) { int r = snprintf(opts->redirectsockname, - sizeof(opts->redirectsockname), - "%s", *pinst + 9); + sizeof(opts->redirectsockname), "%s", + *pinst + 9); if (r < 0 || r >= (int)sizeof(opts->redirectsockname)) abortmsg("redirect path is too long (%d)", r); needreconnect = 1; @@ -365,10 +366,9 @@ */ static int isunsupported(int argc, const char *argv[]) { - enum { - SERVE = 1, - DAEMON = 2, - SERVEDAEMON = SERVE | DAEMON, + enum { SERVE = 1, + DAEMON = 2, + SERVEDAEMON = SERVE | DAEMON, }; unsigned int state = 0; int i; @@ -378,7 +378,7 @@ if (i == 0 && strcmp("serve", argv[i]) == 0) state |= SERVE; else if (strcmp("-d", argv[i]) == 0 || - strcmp("--daemon", argv[i]) == 0) + strcmp("--daemon", argv[i]) == 0) state |= DAEMON; } return (state & SERVEDAEMON) == SERVEDAEMON; @@ -401,9 +401,9 @@ if (getenv("CHGINTERNALMARK")) abortmsg("chg started by chg detected.\n" - "Please make sure ${HG:-hg} is not a symlink or " - "wrapper to chg. Alternatively, set $CHGHG to the " - "path of real hg."); + "Please make sure ${HG:-hg} is not a symlink or " + "wrapper to chg. 
Alternatively, set $CHGHG to the " + "path of real hg."); if (isunsupported(argc - 1, argv + 1)) execoriginalhg(argv); @@ -435,11 +435,11 @@ hgc_close(hgc); if (++retry > 10) abortmsg("too many redirections.\n" - "Please make sure %s is not a wrapper which " - "changes sensitive environment variables " - "before executing hg. If you have to use a " - "wrapper, wrap chg instead of hg.", - gethgcmd()); + "Please make sure %s is not a wrapper which " + "changes sensitive environment variables " + "before executing hg. If you have to use a " + "wrapper, wrap chg instead of hg.", + gethgcmd()); } setupsignalhandler(hgc_peerpid(hgc), hgc_peerpgid(hgc)); diff -r fb92df8b634c -r ed5448edcbfa contrib/chg/hgclient.c --- a/contrib/chg/hgclient.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/chg/hgclient.c Wed Apr 18 15:32:08 2018 -0400 @@ -7,7 +7,7 @@ * GNU General Public License version 2 or any later version. */ -#include /* for ntohl(), htonl() */ +#include /* for ntohl(), htonl() */ #include #include #include @@ -26,16 +26,15 @@ #include "procutil.h" #include "util.h" -enum { - CAP_GETENCODING = 0x0001, - CAP_RUNCOMMAND = 0x0002, - /* cHg extension: */ - CAP_ATTACHIO = 0x0100, - CAP_CHDIR = 0x0200, - CAP_SETENV = 0x0800, - CAP_SETUMASK = 0x1000, - CAP_VALIDATE = 0x2000, - CAP_SETPROCNAME = 0x4000, +enum { CAP_GETENCODING = 0x0001, + CAP_RUNCOMMAND = 0x0002, + /* cHg extension: */ + CAP_ATTACHIO = 0x0100, + CAP_CHDIR = 0x0200, + CAP_SETENV = 0x0800, + CAP_SETUMASK = 0x1000, + CAP_VALIDATE = 0x2000, + CAP_SETPROCNAME = 0x4000, }; typedef struct { @@ -44,15 +43,15 @@ } cappair_t; static const cappair_t captable[] = { - {"getencoding", CAP_GETENCODING}, - {"runcommand", CAP_RUNCOMMAND}, - {"attachio", CAP_ATTACHIO}, - {"chdir", CAP_CHDIR}, - {"setenv", CAP_SETENV}, - {"setumask", CAP_SETUMASK}, - {"validate", CAP_VALIDATE}, - {"setprocname", CAP_SETPROCNAME}, - {NULL, 0}, /* terminator */ + {"getencoding", CAP_GETENCODING}, + {"runcommand", CAP_RUNCOMMAND}, + {"attachio", 
CAP_ATTACHIO}, + {"chdir", CAP_CHDIR}, + {"setenv", CAP_SETENV}, + {"setumask", CAP_SETUMASK}, + {"validate", CAP_VALIDATE}, + {"setprocname", CAP_SETPROCNAME}, + {NULL, 0}, /* terminator */ }; typedef struct { @@ -88,8 +87,8 @@ if (newsize <= ctx->maxdatasize) return; - newsize = defaultdatasize - * ((newsize + defaultdatasize - 1) / defaultdatasize); + newsize = defaultdatasize * + ((newsize + defaultdatasize - 1) / defaultdatasize); ctx->data = reallocx(ctx->data, newsize); ctx->maxdatasize = newsize; debugmsg("enlarge context buffer to %zu", ctx->maxdatasize); @@ -126,12 +125,12 @@ enlargecontext(&hgc->ctx, hgc->ctx.datasize); if (isupper(hgc->ctx.ch) && hgc->ctx.ch != 'S') - return; /* assumes input request */ + return; /* assumes input request */ size_t cursize = 0; while (cursize < hgc->ctx.datasize) { rsize = recv(hgc->sockfd, hgc->ctx.data + cursize, - hgc->ctx.datasize - cursize, 0); + hgc->ctx.datasize - cursize, 0); if (rsize < 1) abortmsg("failed to read data block"); cursize += rsize; @@ -176,19 +175,19 @@ /* Build '\0'-separated list of args. argsize < 0 denotes that args are * terminated by NULL. */ static void packcmdargs(context_t *ctx, const char *const args[], - ssize_t argsize) + ssize_t argsize) { ctx->datasize = 0; const char *const *const end = (argsize >= 0) ? 
args + argsize : NULL; for (const char *const *it = args; it != end && *it; ++it) { - const size_t n = strlen(*it) + 1; /* include '\0' */ + const size_t n = strlen(*it) + 1; /* include '\0' */ enlargecontext(ctx, ctx->datasize + n); memcpy(ctx->data + ctx->datasize, *it, n); ctx->datasize += n; } if (ctx->datasize > 0) - --ctx->datasize; /* strip last '\0' */ + --ctx->datasize; /* strip last '\0' */ } /* Extract '\0'-separated list of args to new buffer, terminated by NULL */ @@ -199,7 +198,7 @@ const char *s = ctx->data; const char *e = ctx->data + ctx->datasize; for (;;) { - if (nargs + 1 >= maxnargs) { /* including last NULL */ + if (nargs + 1 >= maxnargs) { /* including last NULL */ maxnargs += 256; args = reallocx(args, maxnargs * sizeof(args[0])); } @@ -237,7 +236,7 @@ { context_t *ctx = &hgc->ctx; enlargecontext(ctx, ctx->datasize + 1); - ctx->data[ctx->datasize] = '\0'; /* terminate last string */ + ctx->data[ctx->datasize] = '\0'; /* terminate last string */ const char **args = unpackcmdargsnul(ctx); if (!args[0] || !args[1] || !args[2]) @@ -269,8 +268,8 @@ for (;;) { readchannel(hgc); context_t *ctx = &hgc->ctx; - debugmsg("response read from channel %c, size %zu", - ctx->ch, ctx->datasize); + debugmsg("response read from channel %c, size %zu", ctx->ch, + ctx->datasize); switch (ctx->ch) { case 'o': fwrite(ctx->data, sizeof(ctx->data[0]), ctx->datasize, @@ -299,7 +298,7 @@ default: if (isupper(ctx->ch)) abortmsg("cannot handle response (ch = %c)", - ctx->ch); + ctx->ch); } } } @@ -366,8 +365,8 @@ static void updateprocname(hgclient_t *hgc) { - int r = snprintf(hgc->ctx.data, hgc->ctx.maxdatasize, - "chg[worker/%d]", (int)getpid()); + int r = snprintf(hgc->ctx.data, hgc->ctx.maxdatasize, "chg[worker/%d]", + (int)getpid()); if (r < 0 || (size_t)r >= hgc->ctx.maxdatasize) abortmsg("insufficient buffer to write procname (r = %d)", r); hgc->ctx.datasize = (size_t)r; @@ -387,7 +386,7 @@ static const int fds[3] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO}; 
struct msghdr msgh; memset(&msgh, 0, sizeof(msgh)); - struct iovec iov = {ctx->data, ctx->datasize}; /* dummy payload */ + struct iovec iov = {ctx->data, ctx->datasize}; /* dummy payload */ msgh.msg_iov = &iov; msgh.msg_iovlen = 1; char fdbuf[CMSG_SPACE(sizeof(fds))]; @@ -552,7 +551,7 @@ * the last string is guaranteed to be NULL. */ const char **hgc_validate(hgclient_t *hgc, const char *const args[], - size_t argsize) + size_t argsize) { assert(hgc); if (!(hgc->capflags & CAP_VALIDATE)) diff -r fb92df8b634c -r ed5448edcbfa contrib/chg/hgclient.h --- a/contrib/chg/hgclient.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/chg/hgclient.h Wed Apr 18 15:32:08 2018 -0400 @@ -22,9 +22,9 @@ pid_t hgc_peerpid(const hgclient_t *hgc); const char **hgc_validate(hgclient_t *hgc, const char *const args[], - size_t argsize); + size_t argsize); int hgc_runcommand(hgclient_t *hgc, const char *const args[], size_t argsize); void hgc_attachio(hgclient_t *hgc); void hgc_setenv(hgclient_t *hgc, const char *const envp[]); -#endif /* HGCLIENT_H_ */ +#endif /* HGCLIENT_H_ */ diff -r fb92df8b634c -r ed5448edcbfa contrib/chg/procutil.c --- a/contrib/chg/procutil.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/chg/procutil.c Wed Apr 18 15:32:08 2018 -0400 @@ -54,7 +54,7 @@ goto error; forwardsignal(sig); - if (raise(sig) < 0) /* resend to self */ + if (raise(sig) < 0) /* resend to self */ goto error; if (sigaction(sig, &sa, &oldsa) < 0) goto error; @@ -205,8 +205,8 @@ close(pipefds[0]); close(pipefds[1]); - int r = execle("/bin/sh", "/bin/sh", "-c", pagercmd, NULL, - envp); + int r = + execle("/bin/sh", "/bin/sh", "-c", pagercmd, NULL, envp); if (r < 0) { abortmsgerrno("cannot start pager '%s'", pagercmd); } diff -r fb92df8b634c -r ed5448edcbfa contrib/chg/util.c --- a/contrib/chg/util.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/chg/util.c Wed Apr 18 15:32:08 2018 -0400 @@ -62,7 +62,8 @@ static int debugmsgenabled = 0; static double debugstart = 0; -static double now() { +static double 
now() +{ struct timeval t; gettimeofday(&t, NULL); return t.tv_usec / 1e6 + t.tv_sec; diff -r fb92df8b634c -r ed5448edcbfa contrib/chg/util.h --- a/contrib/chg/util.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/chg/util.h Wed Apr 18 15:32:08 2018 -0400 @@ -32,4 +32,4 @@ int runshellcmd(const char *cmd, const char *envp[], const char *cwd); -#endif /* UTIL_H_ */ +#endif /* UTIL_H_ */ diff -r fb92df8b634c -r ed5448edcbfa contrib/clang-format-blacklist --- a/contrib/clang-format-blacklist Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -# Files that just need to be migrated to the formatter. -# Do not add new files here! -contrib/chg/chg.c -contrib/chg/hgclient.c -contrib/chg/hgclient.h -contrib/chg/procutil.c -contrib/chg/procutil.h -contrib/chg/util.c -contrib/chg/util.h -contrib/hgsh/hgsh.c -mercurial/cext/base85.c -mercurial/cext/bdiff.c -mercurial/cext/charencode.c -mercurial/cext/charencode.h -mercurial/cext/diffhelpers.c -mercurial/cext/dirs.c -mercurial/cext/manifest.c -mercurial/cext/mpatch.c -mercurial/cext/osutil.c -mercurial/cext/pathencode.c -mercurial/cext/revlog.c -# Vendored code that we should never format: -contrib/python-zstandard/c-ext/bufferutil.c -contrib/python-zstandard/c-ext/compressiondict.c -contrib/python-zstandard/c-ext/compressionparams.c -contrib/python-zstandard/c-ext/compressionwriter.c -contrib/python-zstandard/c-ext/compressobj.c -contrib/python-zstandard/c-ext/compressor.c -contrib/python-zstandard/c-ext/compressoriterator.c -contrib/python-zstandard/c-ext/constants.c -contrib/python-zstandard/c-ext/decompressionwriter.c -contrib/python-zstandard/c-ext/decompressobj.c -contrib/python-zstandard/c-ext/decompressor.c -contrib/python-zstandard/c-ext/decompressoriterator.c -contrib/python-zstandard/c-ext/frameparams.c -contrib/python-zstandard/c-ext/python-zstandard.h -contrib/python-zstandard/zstd.c -contrib/python-zstandard/zstd/common/bitstream.h 
-contrib/python-zstandard/zstd/common/entropy_common.c -contrib/python-zstandard/zstd/common/error_private.c -contrib/python-zstandard/zstd/common/error_private.h -contrib/python-zstandard/zstd/common/fse.h -contrib/python-zstandard/zstd/common/fse_decompress.c -contrib/python-zstandard/zstd/common/huf.h -contrib/python-zstandard/zstd/common/mem.h -contrib/python-zstandard/zstd/common/pool.c -contrib/python-zstandard/zstd/common/pool.h -contrib/python-zstandard/zstd/common/threading.c -contrib/python-zstandard/zstd/common/threading.h -contrib/python-zstandard/zstd/common/xxhash.c -contrib/python-zstandard/zstd/common/xxhash.h -contrib/python-zstandard/zstd/common/zstd_common.c -contrib/python-zstandard/zstd/common/zstd_errors.h -contrib/python-zstandard/zstd/common/zstd_internal.h -contrib/python-zstandard/zstd/compress/fse_compress.c -contrib/python-zstandard/zstd/compress/huf_compress.c -contrib/python-zstandard/zstd/compress/zstd_compress.c -contrib/python-zstandard/zstd/compress/zstd_opt.h -contrib/python-zstandard/zstd/compress/zstdmt_compress.c -contrib/python-zstandard/zstd/compress/zstdmt_compress.h -contrib/python-zstandard/zstd/decompress/huf_decompress.c -contrib/python-zstandard/zstd/decompress/zstd_decompress.c -contrib/python-zstandard/zstd/dictBuilder/cover.c -contrib/python-zstandard/zstd/dictBuilder/divsufsort.c -contrib/python-zstandard/zstd/dictBuilder/divsufsort.h -contrib/python-zstandard/zstd/dictBuilder/zdict.c -contrib/python-zstandard/zstd/dictBuilder/zdict.h -contrib/python-zstandard/zstd/zstd.h -hgext/fsmonitor/pywatchman/bser.c diff -r fb92df8b634c -r ed5448edcbfa contrib/clang-format-ignorelist --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/clang-format-ignorelist Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,88 @@ +# Files that just need to be migrated to the formatter. +# Do not add new files here! 
+mercurial/cext/dirs.c +mercurial/cext/manifest.c +mercurial/cext/osutil.c +mercurial/cext/revlog.c +# Vendored code that we should never format: +contrib/python-zstandard/c-ext/bufferutil.c +contrib/python-zstandard/c-ext/compressiondict.c +contrib/python-zstandard/c-ext/compressionparams.c +contrib/python-zstandard/c-ext/compressionreader.c +contrib/python-zstandard/c-ext/compressionwriter.c +contrib/python-zstandard/c-ext/compressobj.c +contrib/python-zstandard/c-ext/compressor.c +contrib/python-zstandard/c-ext/compressoriterator.c +contrib/python-zstandard/c-ext/constants.c +contrib/python-zstandard/c-ext/decompressionreader.c +contrib/python-zstandard/c-ext/decompressionwriter.c +contrib/python-zstandard/c-ext/decompressobj.c +contrib/python-zstandard/c-ext/decompressor.c +contrib/python-zstandard/c-ext/decompressoriterator.c +contrib/python-zstandard/c-ext/frameparams.c +contrib/python-zstandard/c-ext/python-zstandard.h +contrib/python-zstandard/zstd.c +contrib/python-zstandard/zstd/common/bitstream.h +contrib/python-zstandard/zstd/common/compiler.h +contrib/python-zstandard/zstd/common/cpu.h +contrib/python-zstandard/zstd/common/entropy_common.c +contrib/python-zstandard/zstd/common/error_private.c +contrib/python-zstandard/zstd/common/error_private.h +contrib/python-zstandard/zstd/common/fse_decompress.c +contrib/python-zstandard/zstd/common/fse.h +contrib/python-zstandard/zstd/common/huf.h +contrib/python-zstandard/zstd/common/mem.h +contrib/python-zstandard/zstd/common/pool.c +contrib/python-zstandard/zstd/common/pool.h +contrib/python-zstandard/zstd/common/threading.c +contrib/python-zstandard/zstd/common/threading.h +contrib/python-zstandard/zstd/common/xxhash.c +contrib/python-zstandard/zstd/common/xxhash.h +contrib/python-zstandard/zstd/common/zstd_common.c +contrib/python-zstandard/zstd/common/zstd_errors.h +contrib/python-zstandard/zstd/common/zstd_internal.h +contrib/python-zstandard/zstd/compress/fse_compress.c 
+contrib/python-zstandard/zstd/compress/huf_compress.c +contrib/python-zstandard/zstd/compress/zstd_compress.c +contrib/python-zstandard/zstd/compress/zstd_compress_internal.h +contrib/python-zstandard/zstd/compress/zstd_double_fast.c +contrib/python-zstandard/zstd/compress/zstd_double_fast.h +contrib/python-zstandard/zstd/compress/zstd_fast.c +contrib/python-zstandard/zstd/compress/zstd_fast.h +contrib/python-zstandard/zstd/compress/zstd_lazy.c +contrib/python-zstandard/zstd/compress/zstd_lazy.h +contrib/python-zstandard/zstd/compress/zstd_ldm.c +contrib/python-zstandard/zstd/compress/zstd_ldm.h +contrib/python-zstandard/zstd/compress/zstdmt_compress.c +contrib/python-zstandard/zstd/compress/zstdmt_compress.h +contrib/python-zstandard/zstd/compress/zstd_opt.c +contrib/python-zstandard/zstd/compress/zstd_opt.h +contrib/python-zstandard/zstd/decompress/huf_decompress.c +contrib/python-zstandard/zstd/decompress/zstd_decompress.c +contrib/python-zstandard/zstd/deprecated/zbuff_common.c +contrib/python-zstandard/zstd/deprecated/zbuff_compress.c +contrib/python-zstandard/zstd/deprecated/zbuff_decompress.c +contrib/python-zstandard/zstd/deprecated/zbuff.h +contrib/python-zstandard/zstd/dictBuilder/cover.c +contrib/python-zstandard/zstd/dictBuilder/divsufsort.c +contrib/python-zstandard/zstd/dictBuilder/divsufsort.h +contrib/python-zstandard/zstd/dictBuilder/zdict.c +contrib/python-zstandard/zstd/dictBuilder/zdict.h +contrib/python-zstandard/zstd/zstd.h +hgext/fsmonitor/pywatchman/bser.c +mercurial/thirdparty/xdiff/xdiff.h +mercurial/thirdparty/xdiff/xdiffi.c +mercurial/thirdparty/xdiff/xdiffi.h +mercurial/thirdparty/xdiff/xemit.c +mercurial/thirdparty/xdiff/xemit.h +mercurial/thirdparty/xdiff/xhistogram.c +mercurial/thirdparty/xdiff/xinclude.h +mercurial/thirdparty/xdiff/xmacros.h +mercurial/thirdparty/xdiff/xmerge.c +mercurial/thirdparty/xdiff/xpatience.c +mercurial/thirdparty/xdiff/xprepare.c +mercurial/thirdparty/xdiff/xprepare.h +mercurial/thirdparty/xdiff/xtypes.h 
+mercurial/thirdparty/xdiff/xutils.c +mercurial/thirdparty/xdiff/xutils.h +mercurial/thirdparty/zope/interface/_zope_interface_coptimizations.c diff -r fb92df8b634c -r ed5448edcbfa contrib/dirstatenonnormalcheck.py --- a/contrib/dirstatenonnormalcheck.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/dirstatenonnormalcheck.py Wed Apr 18 15:32:08 2018 -0400 @@ -17,7 +17,7 @@ """Compute nonnormal entries from dirstate's dmap""" res = set() for f, e in dmap.iteritems(): - if e[0] != 'n' or e[3] == -1: + if e[0] != b'n' or e[3] == -1: res.add(f) return res @@ -25,24 +25,25 @@ """Compute nonnormalset from dmap, check that it matches _nonnormalset""" nonnormalcomputedmap = nonnormalentries(dmap) if _nonnormalset != nonnormalcomputedmap: - ui.develwarn("%s call to %s\n" % (label, orig), config='dirstate') - ui.develwarn("inconsistency in nonnormalset\n", config='dirstate') - ui.develwarn("[nonnormalset] %s\n" % _nonnormalset, config='dirstate') - ui.develwarn("[map] %s\n" % nonnormalcomputedmap, config='dirstate') + ui.develwarn(b"%s call to %s\n" % (label, orig), config=b'dirstate') + ui.develwarn(b"inconsistency in nonnormalset\n", config=b'dirstate') + ui.develwarn(b"[nonnormalset] %s\n" % _nonnormalset, config=b'dirstate') + ui.develwarn(b"[map] %s\n" % nonnormalcomputedmap, config=b'dirstate') def _checkdirstate(orig, self, arg): """Check nonnormal set consistency before and after the call to orig""" checkconsistency(self._ui, orig, self._map, self._map.nonnormalset, - "before") + b"before") r = orig(self, arg) - checkconsistency(self._ui, orig, self._map, self._map.nonnormalset, "after") + checkconsistency(self._ui, orig, self._map, self._map.nonnormalset, + b"after") return r def extsetup(ui): """Wrap functions modifying dirstate to check nonnormalset consistency""" dirstatecl = dirstate.dirstate - devel = ui.configbool('devel', 'all-warnings') - paranoid = ui.configbool('experimental', 'nonnormalparanoidcheck') + devel = ui.configbool(b'devel', b'all-warnings') + 
paranoid = ui.configbool(b'experimental', b'nonnormalparanoidcheck') if devel: extensions.wrapfunction(dirstatecl, '_writedirstate', _checkdirstate) if paranoid: diff -r fb92df8b634c -r ed5448edcbfa contrib/dumprevlog --- a/contrib/dumprevlog Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/dumprevlog Wed Apr 18 15:32:08 2018 -0400 @@ -8,14 +8,20 @@ from mercurial import ( node, revlog, - util, +) +from mercurial.utils import ( + procutil, ) for fp in (sys.stdin, sys.stdout, sys.stderr): - util.setbinary(fp) + procutil.setbinary(fp) + +def binopen(path, mode='rb'): + if 'b' not in mode: + mode = mode + 'b' + return open(path, mode) for f in sys.argv[1:]: - binopen = lambda fn: open(fn, 'rb') r = revlog.revlog(binopen, f) print("file:", f) for i in r: diff -r fb92df8b634c -r ed5448edcbfa contrib/fuzz/Makefile --- a/contrib/fuzz/Makefile Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/fuzz/Makefile Wed Apr 18 15:32:08 2018 -0400 @@ -13,8 +13,28 @@ $$CXX $$CXXFLAGS -std=c++11 -I../../mercurial bdiff.cc \ bdiff-oss-fuzz.o -lFuzzingEngine -o $$OUT/bdiff_fuzzer -all: bdiff +x%.o: ../../mercurial/thirdparty/xdiff/x%.c ../../mercurial/thirdparty/xdiff/*.h + clang -g -O1 -fsanitize=fuzzer-no-link,address -c \ + -o $@ \ + $< + +xdiff: xdiff.cc xdiffi.o xprepare.o xutils.o + clang -DHG_FUZZER_INCLUDE_MAIN=1 -g -O1 -fsanitize=fuzzer-no-link,address \ + -I../../mercurial xdiff.cc \ + xdiffi.o xprepare.o xutils.o -o xdiff -oss-fuzz: bdiff_fuzzer +fuzz-x%.o: ../../mercurial/thirdparty/xdiff/x%.c ../../mercurial/thirdparty/xdiff/*.h + $$CC $$CFLAGS -c \ + -o $@ \ + $< + +xdiff_fuzzer: xdiff.cc fuzz-xdiffi.o fuzz-xprepare.o fuzz-xutils.o + $$CXX $$CXXFLAGS -std=c++11 -I../../mercurial xdiff.cc \ + fuzz-xdiffi.o fuzz-xprepare.o fuzz-xutils.o \ + -lFuzzingEngine -o $$OUT/xdiff_fuzzer + +all: bdiff xdiff + +oss-fuzz: bdiff_fuzzer xdiff_fuzzer .PHONY: all oss-fuzz diff -r fb92df8b634c -r ed5448edcbfa contrib/fuzz/README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/contrib/fuzz/README.rst Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,26 @@ +How to add fuzzers (partially cribbed from oss-fuzz[0]): + + 1) git clone https://github.com/google/oss-fuzz + 2) cd oss-fuzz + 3) python infra/helper.py build_image mercurial + 4) docker run --cap-add=SYS_PTRACE -it -v $HG_REPO_PATH:/hg-new \ + gcr.io/oss-fuzz/mercurial bash + 5) cd /src + 6) rm -r mercurial + 7) ln -s /hg-new mercurial + 8) cd mercurial + 9) compile + 10) ls $OUT + +Step 9 is literally running the command "compile", which is part of +the docker container. Once you have that working, you can build the +fuzzers like this (in the oss-fuzz repo): + +python infra/helper.py build_fuzzers --sanitizer address mercurial $HG_REPO_PATH + +(you can also say "memory", "undefined" or "coverage" for +sanitizer). Then run the built fuzzers like this: + +python infra/helper.py run_fuzzer mercurial -- $FUZZER + +0: https://github.com/google/oss-fuzz/blob/master/docs/new_project_guide.md diff -r fb92df8b634c -r ed5448edcbfa contrib/fuzz/xdiff.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/fuzz/xdiff.cc Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,60 @@ +/* + * xdiff.cc - fuzzer harness for thirdparty/xdiff + * + * Copyright 2018, Google Inc. + * + * This software may be used and distributed according to the terms of + * the GNU General Public License, incorporated herein by reference. + */ +#include "thirdparty/xdiff/xdiff.h" +#include +#include + +extern "C" { + +int hunk_consumer(long a1, long a2, long b1, long b2, void *priv) +{ + // TODO: probably also test returning -1 from this when things break? + return 0; +} + +int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) +{ + if (!Size) { + return 0; + } + // figure out a random point in [0, Size] to split our input. 
+ size_t split = Data[0] / 255.0 * Size; + + mmfile_t a, b; + + // `a` input to diff is data[1:split] + a.ptr = (char *)Data + 1; + // which has len split-1 + a.size = split - 1; + // `b` starts at the next byte after `a` ends + b.ptr = a.ptr + a.size; + b.size = Size - split; + xpparam_t xpp = { + XDF_INDENT_HEURISTIC, /* flags */ + }; + xdemitconf_t xecfg = { + XDL_EMIT_BDIFFHUNK, /* flags */ + hunk_consumer, /* hunk_consume_func */ + }; + xdemitcb_t ecb = { + NULL, /* priv */ + }; + xdl_diff(&a, &b, &xpp, &xecfg, &ecb); + return 0; // Non-zero return values are reserved for future use. +} + +#ifdef HG_FUZZER_INCLUDE_MAIN +int main(int argc, char **argv) +{ + const char data[] = "asdf"; + return LLVMFuzzerTestOneInput((const uint8_t *)data, 4); +} +#endif + +} // extern "C" diff -r fb92df8b634c -r ed5448edcbfa contrib/hgsh/hgsh.c --- a/contrib/hgsh/hgsh.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/hgsh/hgsh.c Wed Apr 18 15:32:08 2018 -0400 @@ -48,7 +48,7 @@ * have such machine, set to NULL. */ #ifndef HG_GATEWAY -#define HG_GATEWAY "gateway" +#define HG_GATEWAY "gateway" #endif /* @@ -56,7 +56,7 @@ * NULL. */ #ifndef HG_HOST -#define HG_HOST "mercurial" +#define HG_HOST "mercurial" #endif /* @@ -64,7 +64,7 @@ * host username are same, set to NULL. */ #ifndef HG_USER -#define HG_USER "hg" +#define HG_USER "hg" #endif /* @@ -72,14 +72,14 @@ * validate location of repo when someone is try to access, set to NULL. */ #ifndef HG_ROOT -#define HG_ROOT "/home/hg/repos" +#define HG_ROOT "/home/hg/repos" #endif /* * HG: path to the mercurial executable to run. */ #ifndef HG -#define HG "/home/hg/bin/hg" +#define HG "/home/hg/bin/hg" #endif /* @@ -88,7 +88,7 @@ * impossible, set to NULL. */ #ifndef HG_SHELL -#define HG_SHELL NULL +#define HG_SHELL NULL /* #define HG_SHELL "/bin/bash" */ #endif @@ -97,7 +97,7 @@ * should not get helpful message, set to NULL. */ #ifndef HG_HELP -#define HG_HELP "please contact support@example.com for help." 
+#define HG_HELP "please contact support@example.com for help." #endif /* @@ -106,7 +106,7 @@ * arguments it is called with. see forward_through_gateway. */ #ifndef SSH -#define SSH "/usr/bin/ssh" +#define SSH "/usr/bin/ssh" #endif /* @@ -249,7 +249,6 @@ hg_serve, }; - /* * attempt to verify that a directory is really a hg repo, by testing * for the existence of a subdirectory. @@ -310,8 +309,7 @@ if (sscanf(argv[2], "hg init %as", &repo) == 1) { cmd = hg_init; - } - else if (sscanf(argv[2], "hg -R %as serve --stdio", &repo) == 1) { + } else if (sscanf(argv[2], "hg -R %as serve --stdio", &repo) == 1) { cmd = hg_serve; } else { goto badargs; diff -r fb92df8b634c -r ed5448edcbfa contrib/import-checker.py --- a/contrib/import-checker.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/import-checker.py Wed Apr 18 15:32:08 2018 -0400 @@ -31,12 +31,15 @@ # for cffi modules to re-export pure functions 'mercurial.pure.base85', 'mercurial.pure.bdiff', - 'mercurial.pure.diffhelpers', 'mercurial.pure.mpatch', 'mercurial.pure.osutil', 'mercurial.pure.parsers', # third-party imports should be directly imported 'mercurial.thirdparty', + 'mercurial.thirdparty.cbor', + 'mercurial.thirdparty.cbor.cbor2', + 'mercurial.thirdparty.zope', + 'mercurial.thirdparty.zope.interface', ) # Whitelist of symbols that can be directly imported. 
diff -r fb92df8b634c -r ed5448edcbfa contrib/mercurial.spec --- a/contrib/mercurial.spec Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/mercurial.spec Wed Apr 18 15:32:08 2018 -0400 @@ -6,8 +6,8 @@ %global pythonver %{withpython} %global pythonname Python-%{withpython} -%global docutilsname docutils-0.12 -%global docutilsmd5 4622263b62c5c771c03502afa3157768 +%global docutilsname docutils-0.14 +%global docutilsmd5 c53768d63db3873b7d452833553469de %global pythonhg python-hg %global hgpyprefix /opt/%{pythonhg} # byte compilation will fail on some some Python /test/ files diff -r fb92df8b634c -r ed5448edcbfa contrib/perf.py --- a/contrib/perf.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/perf.py Wed Apr 18 15:32:08 2018 -0400 @@ -64,6 +64,12 @@ from mercurial import scmutil # since 1.9 (or 8b252e826c68) except ImportError: pass +try: + from mercurial import pycompat + getargspec = pycompat.getargspec # added to module after 4.5 +except (ImportError, AttributeError): + import inspect + getargspec = inspect.getargspec # for "historical portability": # define util.safehasattr forcibly, because util.safehasattr has been @@ -114,9 +120,8 @@ if safehasattr(registrar, 'command'): command = registrar.command(cmdtable) elif safehasattr(cmdutil, 'command'): - import inspect command = cmdutil.command(cmdtable) - if 'norepo' not in inspect.getargspec(command)[0]: + if 'norepo' not in getargspec(command).args: # for "historical portability": # wrap original cmdutil.command, because "norepo" option has # been available since 3.1 (or 75a96326cecb) @@ -418,7 +423,8 @@ oldquiet = repo.ui.quiet repo.ui.quiet = True matcher = scmutil.match(repo[None]) - timer(lambda: scmutil.addremove(repo, matcher, "", dry_run=True)) + opts['dry_run'] = True + timer(lambda: scmutil.addremove(repo, matcher, "", opts)) finally: repo.ui.quiet = oldquiet fm.end() @@ -761,7 +767,7 @@ @command('perfchangeset', formatteropts) def perfchangeset(ui, repo, rev, **opts): timer, fm = gettimer(ui, opts) - n = 
repo[rev].node() + n = scmutil.revsingle(repo, rev).node() def d(): repo.changelog.read(n) #repo.changelog._cache = None @@ -847,7 +853,7 @@ timer, fm = gettimer(ui, opts) import mercurial.revlog mercurial.revlog._prereadsize = 2**24 # disable lazy parser in old hg - n = repo[rev].node() + n = scmutil.revsingle(repo, rev).node() cl = mercurial.revlog.revlog(getsvfs(repo), "00changelog.i") def d(): cl.rev(n) @@ -934,11 +940,16 @@ timer(d) fm.end() -def _bdiffworker(q, ready, done): +def _bdiffworker(q, blocks, xdiff, ready, done): while not done.is_set(): pair = q.get() while pair is not None: - mdiff.textdiff(*pair) + if xdiff: + mdiff.bdiff.xdiffblocks(*pair) + elif blocks: + mdiff.bdiff.blocks(*pair) + else: + mdiff.textdiff(*pair) q.task_done() pair = q.get() q.task_done() # for the None one @@ -949,6 +960,8 @@ ('', 'count', 1, 'number of revisions to test (when using --startrev)'), ('', 'alldata', False, 'test bdiffs for all associated revisions'), ('', 'threads', 0, 'number of thread to use (disable with 0)'), + ('', 'blocks', False, 'test computing diffs into blocks'), + ('', 'xdiff', False, 'use xdiff algorithm'), ], '-c|-m|FILE REV') @@ -964,6 +977,11 @@ measure bdiffs for all changes related to that changeset (manifest and filelogs). 
""" + opts = pycompat.byteskwargs(opts) + + if opts['xdiff'] and not opts['blocks']: + raise error.CommandError('perfbdiff', '--xdiff requires --blocks') + if opts['alldata']: opts['changelog'] = True @@ -972,6 +990,8 @@ elif rev is None: raise error.CommandError('perfbdiff', 'invalid arguments') + blocks = opts['blocks'] + xdiff = opts['xdiff'] textpairs = [] r = cmdutil.openrevlog(repo, 'perfbdiff', file_, opts) @@ -1002,7 +1022,12 @@ if not withthreads: def d(): for pair in textpairs: - mdiff.textdiff(*pair) + if xdiff: + mdiff.bdiff.xdiffblocks(*pair) + elif blocks: + mdiff.bdiff.blocks(*pair) + else: + mdiff.textdiff(*pair) else: q = util.queue() for i in xrange(threads): @@ -1010,7 +1035,8 @@ ready = threading.Condition() done = threading.Event() for i in xrange(threads): - threading.Thread(target=_bdiffworker, args=(q, ready, done)).start() + threading.Thread(target=_bdiffworker, + args=(q, blocks, xdiff, ready, done)).start() q.join() def d(): for pair in textpairs: @@ -1031,6 +1057,71 @@ with ready: ready.notify_all() +@command('perfunidiff', revlogopts + formatteropts + [ + ('', 'count', 1, 'number of revisions to test (when using --startrev)'), + ('', 'alldata', False, 'test unidiffs for all associated revisions'), + ], '-c|-m|FILE REV') +def perfunidiff(ui, repo, file_, rev=None, count=None, **opts): + """benchmark a unified diff between revisions + + This doesn't include any copy tracing - it's just a unified diff + of the texts. + + By default, benchmark a diff between its delta parent and itself. + + With ``--count``, benchmark diffs between delta parents and self for N + revisions starting at the specified revision. + + With ``--alldata``, assume the requested revision is a changeset and + measure diffs for all changes related to that changeset (manifest + and filelogs). 
+ """ + if opts['alldata']: + opts['changelog'] = True + + if opts.get('changelog') or opts.get('manifest'): + file_, rev = None, file_ + elif rev is None: + raise error.CommandError('perfunidiff', 'invalid arguments') + + textpairs = [] + + r = cmdutil.openrevlog(repo, 'perfunidiff', file_, opts) + + startrev = r.rev(r.lookup(rev)) + for rev in range(startrev, min(startrev + count, len(r) - 1)): + if opts['alldata']: + # Load revisions associated with changeset. + ctx = repo[rev] + mtext = repo.manifestlog._revlog.revision(ctx.manifestnode()) + for pctx in ctx.parents(): + pman = repo.manifestlog._revlog.revision(pctx.manifestnode()) + textpairs.append((pman, mtext)) + + # Load filelog revisions by iterating manifest delta. + man = ctx.manifest() + pman = ctx.p1().manifest() + for filename, change in pman.diff(man).items(): + fctx = repo.file(filename) + f1 = fctx.revision(change[0][0] or -1) + f2 = fctx.revision(change[1][0] or -1) + textpairs.append((f1, f2)) + else: + dp = r.deltaparent(rev) + textpairs.append((r.revision(dp), r.revision(rev))) + + def d(): + for left, right in textpairs: + # The date strings don't matter, so we pass empty strings. 
+ headerlines, hunks = mdiff.unidiff( + left, '', right, '', 'left', 'right', binary=False) + # consume iterators in roughly the way patch.py does + b'\n'.join(headerlines) + b''.join(sum((list(hlines) for hrange, hlines in hunks), [])) + timer, fm = gettimer(ui, opts) + timer(d) + fm.end() + @command('perfdiffwd', formatteropts) def perfdiffwd(ui, repo, **opts): """Profile diff of working directory changes""" @@ -1498,11 +1589,13 @@ ('', 'clear-revbranch', False, 'purge the revbranch cache between computation'), ] + formatteropts) -def perfbranchmap(ui, repo, full=False, clear_revbranch=False, **opts): +def perfbranchmap(ui, repo, *filternames, **opts): """benchmark the update of a branchmap This benchmarks the full repo.branchmap() call with read and write disabled """ + full = opts.get("full", False) + clear_revbranch = opts.get("clear_revbranch", False) timer, fm = gettimer(ui, opts) def getbranchmap(filtername): """generate a benchmark function for the filtername""" @@ -1521,6 +1614,8 @@ return d # add filter in smaller subset to bigger subset possiblefilters = set(repoview.filtertable) + if filternames: + possiblefilters &= set(filternames) subsettable = getbranchmapsubsettable() allfilters = [] while possiblefilters: @@ -1537,8 +1632,9 @@ if not full: for name in allfilters: repo.filtered(name).branchmap() - # add unfiltered - allfilters.append(None) + if not filternames or 'unfiltered' in filternames: + # add unfiltered + allfilters.append(None) branchcacheread = safeattrsetter(branchmap, 'read') branchcachewrite = safeattrsetter(branchmap.branchcache, 'write') @@ -1546,7 +1642,10 @@ branchcachewrite.set(lambda bc, repo: None) try: for name in allfilters: - timer(getbranchmap(name), title=str(name)) + printname = name + if name is None: + printname = 'unfiltered' + timer(getbranchmap(name), title=str(printname)) finally: branchcacheread.restore() branchcachewrite.restore() diff -r fb92df8b634c -r ed5448edcbfa contrib/phabricator.py --- 
a/contrib/phabricator.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/phabricator.py Wed Apr 18 15:32:08 2018 -0400 @@ -22,7 +22,8 @@ url = https://phab.example.com/ # API token. Get it from https://$HOST/conduit/login/ - token = cli-xxxxxxxxxxxxxxxxxxxxxxxxxxxx + # Deprecated: see [phabricator.auth] below + #token = cli-xxxxxxxxxxxxxxxxxxxxxxxxxxxx # Repo callsign. If a repo has a URL https://$HOST/diffusion/FOO, then its # callsign is "FOO". @@ -33,6 +34,11 @@ # if you need to specify advanced options that is not easily supported by # the internal library. curlcmd = curl --connect-timeout 2 --retry 3 --silent + + [phabricator.auth] + example.url = https://phab.example.com/ + # API token. Get it from https://$HOST/conduit/login/ + example.token = cli-xxxxxxxxxxxxxxxxxxxxxxxxxxxx """ from __future__ import absolute_import @@ -60,6 +66,9 @@ url as urlmod, util, ) +from mercurial.utils import ( + procutil, +) cmdtable = {} command = registrar.command(cmdtable) @@ -94,20 +103,56 @@ process('', params) return util.urlreq.urlencode(flatparams) +printed_token_warning = False + +def readlegacytoken(repo): + """Transitional support for old phabricator tokens. + + Remove before the 4.6 release. + """ + global printed_token_warning + token = repo.ui.config('phabricator', 'token') + if token and not printed_token_warning: + printed_token_warning = True + repo.ui.warn(_('phabricator.token is deprecated - please ' + 'migrate to the phabricator.auth section.\n')) + return token + def readurltoken(repo): """return conduit url, token and make sure they exist Currently read from [phabricator] config section. In the future, it might make sense to read from .arcconfig and .arcrc as well. 
""" - values = [] - section = 'phabricator' - for name in ['url', 'token']: - value = repo.ui.config(section, name) - if not value: - raise error.Abort(_('config %s.%s is required') % (section, name)) - values.append(value) - return values + url = repo.ui.config('phabricator', 'url') + if not url: + raise error.Abort(_('config %s.%s is required') + % ('phabricator', 'url')) + + groups = {} + for key, val in repo.ui.configitems('phabricator.auth'): + if '.' not in key: + repo.ui.warn(_("ignoring invalid [phabricator.auth] key '%s'\n") + % key) + continue + group, setting = key.rsplit('.', 1) + groups.setdefault(group, {})[setting] = val + + token = None + for group, auth in groups.iteritems(): + if url != auth.get('url'): + continue + token = auth.get('token') + if token: + break + + if not token: + token = readlegacytoken(repo) + if not token: + raise error.Abort(_('Can\'t find conduit token associated to %s') + % (url,)) + + return url, token def callconduit(repo, name, params): """call Conduit API, params is a dict. return json.loads result, or None""" @@ -119,7 +164,8 @@ data = urlencodenested(params) curlcmd = repo.ui.config('phabricator', 'curlcmd') if curlcmd: - sin, sout = util.popen2('%s -d @- %s' % (curlcmd, util.shellquote(url))) + sin, sout = procutil.popen2('%s -d @- %s' + % (curlcmd, procutil.shellquote(url))) sin.write(data) sin.close() body = sout.read() @@ -868,11 +914,12 @@ templatekeyword = registrar.templatekeyword() -@templatekeyword('phabreview') -def template_review(repo, ctx, revcache, **args): +@templatekeyword('phabreview', requires={'ctx'}) +def template_review(context, mapping): """:phabreview: Object describing the review for this changeset. Has attributes `url` and `id`. 
""" + ctx = context.resource(mapping, 'ctx') m = _differentialrevisiondescre.search(ctx.description()) if m: return { diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/MANIFEST.in --- a/contrib/python-zstandard/MANIFEST.in Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/MANIFEST.in Wed Apr 18 15:32:08 2018 -0400 @@ -1,5 +1,7 @@ graft c-ext graft zstd +graft tests include make_cffi.py include setup_zstd.py include zstd.c +include LICENSE diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/NEWS.rst --- a/contrib/python-zstandard/NEWS.rst Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/NEWS.rst Wed Apr 18 15:32:08 2018 -0400 @@ -1,13 +1,201 @@ +=============== Version History =============== +1.0.0 (not yet released) +======================== + +Actions Blocking Release +------------------------ + +* compression and decompression APIs that support ``io.rawIOBase`` interface + (#13). +* Refactor module names so C and CFFI extensions live under ``zstandard`` + package. +* Overall API design review. +* Use Python allocator where possible. +* Figure out what to do about experimental APIs not implemented by CFFI. +* APIs for auto adjusting compression parameters based on input size. e.g. + clamping the window log so it isn't too large for input. +* Consider allowing compressor and decompressor instances to be thread safe, + support concurrent operations. Or track when an operation is in progress and + refuse to let concurrent operations use the same instance. +* Support for magic-less frames for all decompression operations (``decompress()`` + doesn't work due to sniffing the content size and the lack of a ZSTD API to + sniff magic-less frames - this should be fixed in 1.3.5.). +* Audit for complete flushing when ending compression streams. +* Deprecate legacy APIs. +* Audit for ability to control read/write sizes on all APIs. +* Detect memory leaks via bench.py. 
+* Remove low-level compression parameters from ``ZstdCompressor.__init__`` and + require use of ``CompressionParameters``. +* Expose ``ZSTD_getFrameProgression()`` from more compressor types. + +Other Actions Not Blocking Release +--------------------------------------- + +* Support for block compression APIs. +* API for ensuring max memory ceiling isn't exceeded. +* Move off nose for testing. + +0.9.0 (released 2018-04-08) +=========================== + +Backwards Compatibility Notes +----------------------------- + +* CFFI 1.11 or newer is now required (previous requirement was 1.8). +* The primary module is now ``zstandard``. Please change imports of ``zstd`` + and ``zstd_cffi`` to ``import zstandard``. See the README for more. Support + for importing the old names will be dropped in the next release. +* ``ZstdCompressor.read_from()`` and ``ZstdDecompressor.read_from()`` have + been renamed to ``read_to_iter()``. ``read_from()`` is aliased to the new + name and will be deleted in a future release. +* Support for Python 2.6 has been removed. +* Support for Python 3.3 has been removed. +* The ``selectivity`` argument to ``train_dictionary()`` has been removed, as + the feature disappeared from zstd 1.3. +* Support for legacy dictionaries has been removed. Cover dictionaries are now + the default. ``train_cover_dictionary()`` has effectively been renamed to + ``train_dictionary()``. +* The ``allow_empty`` argument from ``ZstdCompressor.compress()`` has been + deleted and the method now allows empty inputs to be compressed by default. +* ``estimate_compression_context_size()`` has been removed. Use + ``CompressionParameters.estimated_compression_context_size()`` instead. +* ``get_compression_parameters()`` has been removed. Use + ``CompressionParameters.from_level()`` instead. +* The arguments to ``CompressionParameters.__init__()`` have changed. If you + were using positional arguments before, the positions now map to different + arguments. 
It is recommended to use keyword arguments to construct + ``CompressionParameters`` instances. +* ``TARGETLENGTH_MAX`` constant has been removed (it disappeared from zstandard + 1.3.4). +* ``ZstdCompressor.write_to()`` and ``ZstdDecompressor.write_to()`` have been + renamed to ``ZstdCompressor.stream_writer()`` and + ``ZstdDecompressor.stream_writer()``, respectively. The old names are still + aliased, but will be removed in the next major release. +* Content sizes are written into frame headers by default + (``ZstdCompressor(write_content_size=True)`` is now the default). +* ``CompressionParameters`` has been renamed to ``ZstdCompressionParameters`` + for consistency with other types. The old name is an alias and will be removed + in the next major release. + +Bug Fixes +--------- + +* Fixed memory leak in ``ZstdCompressor.copy_stream()`` (#40) (from 0.8.2). +* Fixed memory leak in ``ZstdDecompressor.copy_stream()`` (#35) (from 0.8.2). +* Fixed memory leak of ``ZSTD_DDict`` instances in CFFI's ``ZstdDecompressor``. + +New Features +------------ + +* Bundled zstandard library upgraded from 1.1.3 to 1.3.4. This delivers various + bug fixes and performance improvements. It also gives us access to newer + features. +* Support for negative compression levels. +* Support for *long distance matching* (facilitates compression ratios that approach + LZMA). +* Support for reading empty zstandard frames (with an embedded content size + of 0). +* Support for writing and partial support for reading zstandard frames without a + magic header. +* New ``stream_reader()`` API that exposes the ``io.RawIOBase`` interface (allows + you to ``.read()`` from a file-like object). +* Several minor features, bug fixes, and performance enhancements. +* Wheels for Linux and macOS are now provided with releases. + +Changes +------- + +* Functions accepting bytes data now use the buffer protocol and can accept + more types (like ``memoryview`` and ``bytearray``) (#26).
+* Add #includes so compilation on OS X and BSDs works (#20). +* New ``ZstdDecompressor.stream_reader()`` API to obtain a read-only i/o stream + of decompressed data for a source. +* New ``ZstdCompressor.stream_reader()`` API to obtain a read-only i/o stream of + compressed data for a source. +* Renamed ``ZstdDecompressor.read_from()`` to ``ZstdDecompressor.read_to_iter()``. + The old name is still available. +* Renamed ``ZstdCompressor.read_from()`` to ``ZstdCompressor.read_to_iter()``. + ``read_from()`` is still available at its old location. +* Introduce the ``zstandard`` module to import and re-export the C or CFFI + *backend* as appropriate. Behavior can be controlled via the + ``PYTHON_ZSTANDARD_IMPORT_POLICY`` environment variable. See README for + usage info. +* Vendored version of zstd upgraded to 1.3.4. +* Added module constants ``CONTENTSIZE_UNKNOWN`` and ``CONTENTSIZE_ERROR``. +* Add ``STRATEGY_BTULTRA`` compression strategy constant. +* Switch from deprecated ``ZSTD_getDecompressedSize()`` to + ``ZSTD_getFrameContentSize()`` replacement. +* ``ZstdCompressor.compress()`` can now compress empty inputs without requiring + special handling. +* ``ZstdCompressor`` and ``ZstdDecompressor`` now have a ``memory_size()`` + method for determining the current memory utilization of the underlying zstd + primitive. +* ``train_dictionary()`` has new arguments and functionality for trying multiple + variations of COVER parameters and selecting the best one. +* Added module constants ``LDM_MINMATCH_MIN``, ``LDM_MINMATCH_MAX``, and + ``LDM_BUCKETSIZELOG_MAX``. +* Converted all consumers to the zstandard *new advanced API*, which uses + ``ZSTD_compress_generic()`` +* ``CompressionParameters.__init__`` now accepts several more arguments, + including support for *long distance matching*. +* ``ZstdCompressionDict.__init__`` now accepts a ``dict_type`` argument that + controls how the dictionary should be interpreted. 
This can be used to + force the use of *content-only* dictionaries or to require the presence + of the dictionary magic header. +* ``ZstdCompressionDict.precompute_compress()`` can be used to precompute the + compression dictionary so it can efficiently be used with multiple + ``ZstdCompressor`` instances. +* Digested dictionaries are now stored in ``ZstdCompressionDict`` instances, + created automatically on first use, and automatically reused by all + ``ZstdDecompressor`` instances bound to that dictionary. +* All meaningful functions now accept keyword arguments. +* ``ZstdDecompressor.decompressobj()`` now accepts a ``write_size`` argument + to control how much work to perform on every decompressor invocation. +* ``ZstdCompressor.write_to()`` now exposes a ``tell()``, which exposes the + total number of bytes written so far. +* ``ZstdDecompressor.stream_reader()`` now supports ``seek()`` when moving + forward in the stream. +* Removed ``TARGETLENGTH_MAX`` constant. +* Added ``frame_header_size(data)`` function. +* Added ``frame_content_size(data)`` function. +* Consumers of ``ZSTD_decompress*`` have been switched to the new *advanced + decompression* API. +* ``ZstdCompressor`` and ``ZstdCompressionParams`` can now be constructed with + negative compression levels. +* ``ZstdDecompressor`` now accepts a ``max_window_size`` argument to limit the + amount of memory required for decompression operations. +* ``FORMAT_ZSTD1`` and ``FORMAT_ZSTD1_MAGICLESS`` constants to be used with + the ``format`` compression parameter to control whether the frame magic + header is written. +* ``ZstdDecompressor`` now accepts a ``format`` argument to control the + expected frame format. +* ``ZstdCompressor`` now has a ``frame_progression()`` method to return + information about the current compression operation. +* Error messages in CFFI no longer have ``b''`` literals. +* Compiler warnings and underlying overflow issues on 32-bit platforms have been + fixed. 
+* Builds in CI now build with compiler warnings as errors. This should hopefully + prevent new compiler warnings from being introduced. +* Make ``ZstdCompressor(write_content_size=True)`` and + ``CompressionParameters(write_content_size=True)`` the default. +* ``CompressionParameters`` has been renamed to ``ZstdCompressionParameters``. + +0.8.2 (released 2018-02-22) +--------------------------- + +* Fixed memory leak in ``ZstdCompressor.copy_stream()`` (#40). +* Fixed memory leak in ``ZstdDecompressor.copy_stream()`` (#35). + 0.8.1 (released 2017-04-08) --------------------------- * Add #includes so compilation on OS X and BSDs works (#20). 0.8.0 (released 2017-03-08) ---------------------------- +=========================== * CompressionParameters now has a estimated_compression_context_size() method. zstd.estimate_compression_context_size() is now deprecated and slated for @@ -35,7 +223,7 @@ DictParameters instance to control dictionary generation. 0.7.0 (released 2017-02-07) ---------------------------- +=========================== * Added zstd.get_frame_parameters() to obtain info about a zstd frame. * Added ZstdDecompressor.decompress_content_dict_chain() for efficient @@ -62,7 +250,7 @@ * DictParameters instances now expose their values as attributes. 0.6.0 (released 2017-01-14) ---------------------------- +=========================== * Support for legacy zstd protocols (build time opt in feature). * Automation improvements to test against Python 3.6, latest versions @@ -79,17 +267,17 @@ * Disallow compress(b'') when writing content sizes by default (issue #11). 0.5.2 (released 2016-11-12) ---------------------------- +=========================== * more packaging fixes for source distribution 0.5.1 (released 2016-11-12) ---------------------------- +=========================== * setup_zstd.py is included in the source distribution 0.5.0 (released 2016-11-10) ---------------------------- +=========================== * Vendored version of zstd updated to 1.1.1.
* Continuous integration for Python 3.6 and 3.7 @@ -114,8 +302,8 @@ * The monolithic ``zstd.c`` file has been split into a header file defining types and separate ``.c`` source files for the implementation. -History of the Project -====================== +Older History +============= 2016-08-31 - Zstandard 1.0.0 is released and Gregory starts hacking on a Python extension for use by the Mercurial project. A very hacky prototype diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/README.rst --- a/contrib/python-zstandard/README.rst Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/README.rst Wed Apr 18 15:32:08 2018 -0400 @@ -11,69 +11,18 @@ performance. This means exposing most of the features and flexibility of the C API while not sacrificing usability or safety that Python provides. -The canonical home for this project is +The canonical home for this project lives in a Mercurial repository run by +the author. For convenience, that repository is frequently synchronized to https://github.com/indygreg/python-zstandard. | |ci-status| |win-ci-status| -State of Project -================ - -The project is officially in beta state. The author is reasonably satisfied -that functionality works as advertised. **There will be some backwards -incompatible changes before 1.0, probably in the 0.9 release.** This may -involve renaming the main module from *zstd* to *zstandard* and renaming -various types and methods. Pin the package version to prevent unwanted -breakage when this change occurs! - -This project is vendored and distributed with Mercurial 4.1, where it is -used in a production capacity. - -There is continuous integration for Python versions 2.6, 2.7, and 3.3+ -on Linux x86_x64 and Windows x86 and x86_64. The author is reasonably -confident the extension is stable and works as advertised on these -platforms. - -The CFFI bindings are mostly feature complete. 
Where a feature is implemented -in CFFI, unit tests run against both C extension and CFFI implementation to -ensure behavior parity. - -Expected Changes ----------------- - -The author is reasonably confident in the current state of what's -implemented on the ``ZstdCompressor`` and ``ZstdDecompressor`` types. -Those APIs likely won't change significantly. Some low-level behavior -(such as naming and types expected by arguments) may change. - -There will likely be arguments added to control the input and output -buffer sizes (currently, certain operations read and write in chunk -sizes using zstd's preferred defaults). - -There should be an API that accepts an object that conforms to the buffer -interface and returns an iterator over compressed or decompressed output. - -There should be an API that exposes an ``io.RawIOBase`` interface to -compressor and decompressor streams, like how ``gzip.GzipFile`` from -the standard library works (issue 13). - -The author is on the fence as to whether to support the extremely -low level compression and decompression APIs. It could be useful to -support compression without the framing headers. But the author doesn't -believe it a high priority at this time. - -There will likely be a refactoring of the module names. Currently, -``zstd`` is a C extension and ``zstd_cffi`` is the CFFI interface. -This means that all code for the C extension must be implemented in -C. ``zstd`` may be converted to a Python module so code can be reused -between CFFI and C and so not all code in the C extension has to be C. - Requirements ============ -This extension is designed to run with Python 2.6, 2.7, 3.3, 3.4, 3.5, and -3.6 on common platforms (Linux, Windows, and OS X). Only x86_64 is -currently well-tested as an architecture. +This extension is designed to run with Python 2.7, 3.4, 3.5, and 3.6 +on common platforms (Linux, Windows, and OS X). x86 and x86_64 are well-tested +on Windows. Only x86_64 is well-tested on Linux and macOS. 
Installing ========== @@ -96,114 +45,82 @@ Performance =========== -Very crude and non-scientific benchmarking (most benchmarks fall in this -category because proper benchmarking is hard) show that the Python bindings -perform within 10% of the native C implementation. - -The following table compares the performance of compressing and decompressing -a 1.1 GB tar file comprised of the files in a Firefox source checkout. Values -obtained with the ``zstd`` program are on the left. The remaining columns detail -performance of various compression APIs in the Python bindings. +zstandard is a highly tunable compression algorithm. In its default settings +(compression level 3), it will be faster at compression and decompression and +will have better compression ratios than zlib on most data sets. When tuned +for speed, it approaches lz4's speed and ratios. When tuned for compression +ratio, it approaches lzma ratios and compression speed, but decompression +speed is much faster. See the official zstandard documentation for more. -+-------+-----------------+-----------------+-----------------+---------------+ -| Level | Native | Simple | Stream In | Stream Out | -| | Comp / Decomp | Comp / Decomp | Comp / Decomp | Comp | -+=======+=================+=================+=================+===============+ -| 1 | 490 / 1338 MB/s | 458 / 1266 MB/s | 407 / 1156 MB/s | 405 MB/s | -+-------+-----------------+-----------------+-----------------+---------------+ -| 2 | 412 / 1288 MB/s | 381 / 1203 MB/s | 345 / 1128 MB/s | 349 MB/s | -+-------+-----------------+-----------------+-----------------+---------------+ -| 3 | 342 / 1312 MB/s | 319 / 1182 MB/s | 285 / 1165 MB/s | 287 MB/s | -+-------+-----------------+-----------------+-----------------+---------------+ -| 11 | 64 / 1506 MB/s | 66 / 1436 MB/s | 56 / 1342 MB/s | 57 MB/s | -+-------+-----------------+-----------------+-----------------+---------------+ - -Again, these are very unscientific. 
But it shows that Python is capable of -compressing at several hundred MB/s and decompressing at over 1 GB/s. - -Comparison to Other Python Bindings -=================================== - -https://pypi.python.org/pypi/zstd is an alternate Python binding to -Zstandard. At the time this was written, the latest release of that -package (1.1.2) only exposed the simple APIs for compression and decompression. -This package exposes much more of the zstd API, including streaming and -dictionary compression. This package also has CFFI support. - -Bundling of Zstandard Source Code -================================= - -The source repository for this project contains a vendored copy of the -Zstandard source code. This is done for a few reasons. +zstandard and this library support multi-threaded compression. There is a +mechanism to compress large inputs using multiple threads. -First, Zstandard is relatively new and not yet widely available as a system -package. Providing a copy of the source code enables the Python C extension -to be compiled without requiring the user to obtain the Zstandard source code -separately. - -Second, Zstandard has both a stable *public* API and an *experimental* API. -The *experimental* API is actually quite useful (contains functionality for -training dictionaries for example), so it is something we wish to expose to -Python. However, the *experimental* API is only available via static linking. -Furthermore, the *experimental* API can change at any time. So, control over -the exact version of the Zstandard library linked against is important to -ensure known behavior. - -Instructions for Building and Testing -===================================== - -Once you have the source code, the extension can be built via setup.py:: - - $ python setup.py build_ext - -We recommend testing with ``nose``:: - - $ nosetests +The performance of this library is usually very similar to what the zstandard +C API can deliver. 
Overhead in this library is due to general Python overhead +and can't easily be avoided by *any* zstandard Python binding. This library +exposes multiple APIs for performing compression and decompression so callers +can pick an API suitable for their need. Contrast with the compression +modules in Python's standard library (like ``zlib``), which only offer limited +mechanisms for performing operations. The API flexibility means consumers can +choose to use APIs that facilitate zero copying or minimize Python object +creation and garbage collection overhead. -A Tox configuration is present to test against multiple Python versions:: - - $ tox - -Tests use the ``hypothesis`` Python package to perform fuzzing. If you -don't have it, those tests won't run. Since the fuzzing tests take longer -to execute than normal tests, you'll need to opt in to running them by -setting the ``ZSTD_SLOW_TESTS`` environment variable. This is set -automatically when using ``tox``. - -The ``cffi`` Python package needs to be installed in order to build the CFFI -bindings. If it isn't present, the CFFI bindings won't be built. - -To create a virtualenv with all development dependencies, do something -like the following:: - - # Python 2 - $ virtualenv venv - - # Python 3 - $ python3 -m venv venv - - $ source venv/bin/activate - $ pip install cffi hypothesis nose tox +This library is capable of single-threaded throughputs well over 1 GB/s. For +exact numbers, measure yourself. The source code repository has a ``bench.py`` +script that can be used to measure things. API === -The compiled C extension provides a ``zstd`` Python module. The CFFI -bindings provide a ``zstd_cffi`` module. Both provide an identical API -interface. 
The types, functions, and attributes exposed by these modules +To interface with Zstandard, simply import the ``zstandard`` module:: + + import zstandard + +It is a popular convention to alias the module as a different name for +brevity:: + + import zstandard as zstd + +This module attempts to import and use either the C extension or CFFI +implementation. On Python platforms known to support C extensions (like +CPython), it raises an ImportError if the C extension cannot be imported. +On Python platforms known to not support C extensions (like PyPy), it only +attempts to import the CFFI implementation and raises ImportError if that +can't be done. On other platforms, it first tries to import the C extension +then falls back to CFFI if that fails and raises ImportError if CFFI fails. + +To change the module import behavior, a ``PYTHON_ZSTANDARD_IMPORT_POLICY`` +environment variable can be set. The following values are accepted: + +default + The behavior described above. +cffi_fallback + Always try to import the C extension then fall back to CFFI if that + fails. +cext + Only attempt to import the C extension. +cffi + Only attempt to import the CFFI implementation. + +In addition, the ``zstandard`` module exports a ``backend`` attribute +containing the string name of the backend being used. It will be one +of ``cext`` or ``cffi`` (for *C extension* and *cffi*, respectively). + +The types, functions, and attributes exposed by the ``zstandard`` module are documented in the sections below. .. note:: The documentation in this section makes references to various zstd - concepts and functionality. The ``Concepts`` section below explains - these concepts in more detail. + concepts and functionality. The source repository contains a + ``docs/concepts.rst`` file explaining these in more detail. ZstdCompressor -------------- The ``ZstdCompressor`` class provides an interface for performing -compression operations. +compression operations. 
Each instance is essentially a wrapper around a +``ZSTD_CCtx`` from the C API. Each instance is associated with parameters that control compression behavior. These come from the following named arguments (all optional): @@ -214,21 +131,21 @@ Compression dictionary to use. Note: When using dictionary data and ``compress()`` is called multiple - times, the ``CompressionParameters`` derived from an integer compression - ``level`` and the first compressed data's size will be reused for all - subsequent operations. This may not be desirable if source data size - varies significantly. + times, the ``ZstdCompressionParameters`` derived from an integer + compression ``level`` and the first compressed data's size will be reused + for all subsequent operations. This may not be desirable if source data + size varies significantly. compression_params - A ``CompressionParameters`` instance (overrides the ``level`` value). + A ``ZstdCompressionParameters`` instance defining compression settings. write_checksum Whether a 4 byte checksum should be written with the compressed data. Defaults to False. If True, the decompressor can verify that decompressed data matches the original input data. write_content_size Whether the size of the uncompressed data will be written into the - header of compressed data. Defaults to False. The data will only be + header of compressed data. Defaults to True. The data will only be written if the compressor knows the size of the input data. This is - likely not true for streaming compression. + often not true for streaming compression. write_dict_id Whether to write the dictionary ID into the compressed data. Defaults to True. The dictionary ID is only written if a dictionary @@ -242,10 +159,25 @@ data. APIs that spawn multiple threads for working on multiple pieces of data have their own ``threads`` argument. +``compression_params`` is mutually exclusive with ``level``, ``write_checksum``, +``write_content_size``, ``write_dict_id``, and ``threads``. 
+ Unless specified otherwise, assume that no two methods of ``ZstdCompressor`` instances can be called from multiple Python threads simultaneously. In other words, assume instances are not thread safe unless stated otherwise. +Utility Methods +^^^^^^^^^^^^^^^ + +``frame_progression()`` returns a 3-tuple containing the number of bytes +ingested, consumed, and produced by the current compression operation. + +``memory_size()`` obtains the memory utilization of the underlying zstd +compression context, in bytes.:: + + cctx = zstd.ZstdCompressor() + memory = cctx.memory_size() + Simple API ^^^^^^^^^^ @@ -256,40 +188,75 @@ The ``data`` argument can be any object that implements the *buffer protocol*. -Unless ``compression_params`` or ``dict_data`` are passed to the -``ZstdCompressor``, each invocation of ``compress()`` will calculate the -optimal compression parameters for the configured compression ``level`` and -input data size (some parameters are fine-tuned for small input sizes). +Stream Reader API +^^^^^^^^^^^^^^^^^ + +``stream_reader(source)`` can be used to obtain an object conforming to the +``io.RawIOBase`` interface for reading compressed output as a stream:: + + with open(path, 'rb') as fh: + cctx = zstd.ZstdCompressor() + with cctx.stream_reader(fh) as reader: + while True: + chunk = reader.read(16384) + if not chunk: + break + + # Do something with compressed chunk. + +The stream can only be read within a context manager. When the context +manager exits, the stream is closed and the underlying resource is +released and future operations against the compression stream will fail. + +The ``source`` argument to ``stream_reader()`` can be any object with a +``read(size)`` method or any object implementing the *buffer protocol*. -If a compression dictionary is being used, the compression parameters -determined from the first input's size will be reused for subsequent -operations.
+``stream_reader()`` accepts a ``size`` argument specifying how large the input +stream is. This is used to adjust compression parameters so they are +tailored to the source size.:: + + with open(path, 'rb') as fh: + cctx = zstd.ZstdCompressor() + with cctx.stream_reader(fh, size=os.stat(path).st_size) as reader: + ... + +If the ``source`` is a stream, you can specify how large ``read()`` requests +to that stream should be via the ``read_size`` argument. It defaults to +``zstandard.COMPRESSION_RECOMMENDED_INPUT_SIZE``.:: -There is currently a deficiency in zstd's C APIs that makes it difficult -to round trip empty inputs when ``write_content_size=True``. Attempting -this will raise a ``ValueError`` unless ``allow_empty=True`` is passed -to ``compress()``. + with open(path, 'rb') as fh: + cctx = zstd.ZstdCompressor() + # Will perform fh.read(8192) when obtaining data to feed into the + # compressor. + with cctx.stream_reader(fh, read_size=8192) as reader: + ... + +The stream returned by ``stream_reader()`` is neither writable nor seekable +(even if the underlying source is seekable). ``readline()`` and +``readlines()`` are not implemented because they don't make sense for +compressed data. ``tell()`` returns the number of compressed bytes +emitted so far. Streaming Input API ^^^^^^^^^^^^^^^^^^^ -``write_to(fh)`` (which behaves as a context manager) allows you to *stream* +``stream_writer(fh)`` (which behaves as a context manager) allows you to *stream* data into a compressor.:: cctx = zstd.ZstdCompressor(level=10) - with cctx.write_to(fh) as compressor: + with cctx.stream_writer(fh) as compressor: compressor.write(b'chunk 0') compressor.write(b'chunk 1') ... -The argument to ``write_to()`` must have a ``write(data)`` method. As +The argument to ``stream_writer()`` must have a ``write(data)`` method. As compressed data is available, ``write()`` will be called with the compressed data as its argument. 
Many common Python types implement ``write()``, including open file handles and ``io.BytesIO``. -``write_to()`` returns an object representing a streaming compressor instance. -It **must** be used as a context manager. That object's ``write(data)`` method -is used to feed data into the compressor. +``stream_writer()`` returns an object representing a streaming compressor +instance. It **must** be used as a context manager. That object's +``write(data)`` method is used to feed data into the compressor. A ``flush()`` method can be called to evict whatever data remains within the compressor's internal state into the output object. This may result in 0 or @@ -303,7 +270,7 @@ you can declare it before compression begins:: cctx = zstd.ZstdCompressor() - with cctx.write_to(fh, size=data_len) as compressor: + with cctx.stream_writer(fh, size=data_len) as compressor: compressor.write(chunk0) compressor.write(chunk1) ... @@ -315,29 +282,35 @@ The size of chunks being ``write()`` to the destination can be specified:: cctx = zstd.ZstdCompressor() - with cctx.write_to(fh, write_size=32768) as compressor: + with cctx.stream_writer(fh, write_size=32768) as compressor: ... To see how much memory is being used by the streaming compressor:: cctx = zstd.ZstdCompressor() - with cctx.write_to(fh) as compressor: + with cctx.stream_writer(fh) as compressor: ... byte_size = compressor.memory_size() +The total number of bytes written so far is exposed via ``tell()``:: + + cctx = zstd.ZstdCompressor() + with cctx.stream_writer(fh) as compressor: + ... + total_written = compressor.tell() + Streaming Output API ^^^^^^^^^^^^^^^^^^^^ -``read_from(reader)`` provides a mechanism to stream data out of a compressor -as an iterator of data chunks.:: +``read_to_iter(reader)`` provides a mechanism to stream data out of a +compressor as an iterator of data chunks.:: cctx = zstd.ZstdCompressor() - for chunk in cctx.read_from(fh): + for chunk in cctx.read_to_iter(fh): # Do something with emitted data.
-``read_from()`` accepts an object that has a ``read(size)`` method or conforms -to the buffer protocol. (``bytes`` and ``memoryview`` are 2 common types that -provide the buffer protocol.) +``read_to_iter()`` accepts an object that has a ``read(size)`` method or +conforms to the buffer protocol. Uncompressed data is fetched from the source either by calling ``read(size)`` or by fetching a slice of data from the object directly (in the case where @@ -348,23 +321,24 @@ it raises or returns an empty bytes (``b''``). It is perfectly valid for the source to deliver fewer bytes than were what requested by ``read(size)``. -Like ``write_to()``, ``read_from()`` also accepts a ``size`` argument +Like ``stream_writer()``, ``read_to_iter()`` also accepts a ``size`` argument declaring the size of the input stream:: cctx = zstd.ZstdCompressor() - for chunk in cctx.read_from(fh, size=some_int): + for chunk in cctx.read_to_iter(fh, size=some_int): pass You can also control the size that data is ``read()`` from the source and the ideal size of output chunks:: cctx = zstd.ZstdCompressor() - for chunk in cctx.read_from(fh, read_size=16384, write_size=8192): + for chunk in cctx.read_to_iter(fh, read_size=16384, write_size=8192): pass -Unlike ``write_to()``, ``read_from()`` does not give direct control over the -sizes of chunks fed into the compressor. Instead, chunk sizes will be whatever -the object being read from delivers. These will often be of a uniform size. +Unlike ``stream_writer()``, ``read_to_iter()`` does not give direct control +over the sizes of chunks fed into the compressor. Instead, chunk sizes will +be whatever the object being read from delivers. These will often be of a +uniform size. Stream Copying API ^^^^^^^^^^^^^^^^^^ @@ -404,7 +378,7 @@ ``flush()`` methods. Each returns compressed data or an empty bytes. The purpose of ``compressobj()`` is to provide an API-compatible interface -with ``zlib.compressobj`` and ``bz2.BZ2Compressor``. 
This allows callers to +with ``zlib.compressobj``, ``bz2.BZ2Compressor``, etc. This allows callers to swap in different compressor objects while using the same API. ``flush()`` accepts an optional argument indicating how to end the stream. @@ -485,13 +459,23 @@ ---------------- The ``ZstdDecompressor`` class provides an interface for performing -decompression. +decompression. It is effectively a wrapper around the ``ZSTD_DCtx`` type from +the C API. Each instance is associated with parameters that control decompression. These come from the following named arguments (all optional): dict_data Compression dictionary to use. +max_window_size + Sets an upper limit on the window size for decompression operations in + kibibytes. This setting can be used to prevent large memory allocations + for inputs using large compression windows. +format + Set the format of data for the decoder. By default, this is + ``zstd.FORMAT_ZSTD1``. It can be set to ``zstd.FORMAT_ZSTD1_MAGICLESS`` to + allow decoding frames without the 4 byte magic header. Not all decompression + APIs support this mode. The interface of this class is very similar to ``ZstdCompressor`` (by design). @@ -499,6 +483,15 @@ instances can be called from multiple Python threads simultaneously. In other words, assume instances are not thread safe unless stated otherwise. +Utility Methods +^^^^^^^^^^^^^^^ + +``memory_size()`` obtains the size of the underlying zstd decompression context, +in bytes.:: + + dctx = zstd.ZstdDecompressor() + size = dctx.memory_size() + Simple API ^^^^^^^^^^ @@ -509,9 +502,10 @@ decompressed = dctx.decompress(data) By default, ``decompress(data)`` will only work on data written with the content -size encoded in its header. This can be achieved by creating a -``ZstdCompressor`` with ``write_content_size=True``. If compressed data without -an embedded content size is seen, ``zstd.ZstdError`` will be raised.
+size encoded in its header (this is the default behavior of +``ZstdCompressor().compress()`` but may not be true for streaming compression). If +compressed data without an embedded content size is seen, ``zstd.ZstdError`` will +be raised. If the compressed data doesn't have its content size embedded within it, decompression can be attempted by specifying the ``max_output_size`` @@ -534,17 +528,67 @@ result in a lot of work for the memory allocator and may result in ``MemoryError`` being raised if the allocation fails. -If the exact size of decompressed data is unknown, it is **strongly** -recommended to use a streaming API. +.. important:: + + If the exact size of decompressed data is unknown (not passed in explicitly + and not stored in the zstandard frame), for performance reasons it is + encouraged to use a streaming API. + +Stream Reader API +^^^^^^^^^^^^^^^^^ + +``stream_reader(source)`` can be used to obtain an object conforming to the +``io.RawIOBase`` interface for reading decompressed output as a stream:: + + with open(path, 'rb') as fh: + dctx = zstd.ZstdDecompressor() + with dctx.stream_reader(fh) as reader: + while True: + chunk = reader.read(16384) + if not chunk: + break + + # Do something with decompressed chunk. + +The stream can only be read within a context manager. When the context +manager exits, the stream is closed and the underlying resource is +released and future operations against the stream will fail. + +The ``source`` argument to ``stream_reader()`` can be any object with a +``read(size)`` method or any object implementing the *buffer protocol*. + +If the ``source`` is a stream, you can specify how large ``read()`` requests +to that stream should be via the ``read_size`` argument. It defaults to +``zstandard.DECOMPRESSION_RECOMMENDED_INPUT_SIZE``.:: + + with open(path, 'rb') as fh: + dctx = zstd.ZstdDecompressor() + # Will perform fh.read(8192) when obtaining data for the decompressor. 
+ with dctx.stream_reader(fh, read_size=8192) as reader: + ... + +The stream returned by ``stream_reader()`` is not writable. + +The stream returned by ``stream_reader()`` is *partially* seekable. +Absolute and relative positions (``SEEK_SET`` and ``SEEK_CUR``) forward +of the current position are allowed. Offsets behind the current read +position and offsets relative to the end of stream are not allowed and +will raise ``ValueError`` if attempted. + +``tell()`` returns the number of decompressed bytes read so far. + +Not all I/O methods are implemented. Notably missing is support for +``readline()``, ``readlines()``, and linewise iteration support. Support for +these is planned for a future release. Streaming Input API ^^^^^^^^^^^^^^^^^^^ -``write_to(fh)`` can be used to incrementally send compressed data to a +``stream_writer(fh)`` can be used to incrementally send compressed data to a decompressor.:: dctx = zstd.ZstdDecompressor() - with dctx.write_to(fh) as decompressor: + with dctx.stream_writer(fh) as decompressor: decompressor.write(compressed_data) This behaves similarly to ``zstd.ZstdCompressor``: compressed data is written to @@ -558,54 +602,56 @@ The size of chunks being ``write()`` to the destination can be specified:: dctx = zstd.ZstdDecompressor() - with dctx.write_to(fh, write_size=16384) as decompressor: + with dctx.stream_writer(fh, write_size=16384) as decompressor: pass You can see how much memory is being used by the decompressor:: dctx = zstd.ZstdDecompressor() - with dctx.write_to(fh) as decompressor: + with dctx.stream_writer(fh) as decompressor: byte_size = decompressor.memory_size() Streaming Output API ^^^^^^^^^^^^^^^^^^^^ -``read_from(fh)`` provides a mechanism to stream decompressed data out of a +``read_to_iter(fh)`` provides a mechanism to stream decompressed data out of a compressed source as an iterator of data chunks.:: dctx = zstd.ZstdDecompressor() - for chunk in dctx.read_from(fh): + for chunk in dctx.read_to_iter(fh): # Do 
something with original data. -``read_from()`` accepts a) an object with a ``read(size)`` method that will -return compressed bytes b) an object conforming to the buffer protocol that -can expose its data as a contiguous range of bytes. The ``bytes`` and -``memoryview`` types expose this buffer protocol. +``read_to_iter()`` accepts an object with a ``read(size)`` method that will +return compressed bytes or an object conforming to the buffer protocol that +can expose its data as a contiguous range of bytes. -``read_from()`` returns an iterator whose elements are chunks of the +``read_to_iter()`` returns an iterator whose elements are chunks of the decompressed data. The size of requested ``read()`` from the source can be specified:: dctx = zstd.ZstdDecompressor() - for chunk in dctx.read_from(fh, read_size=16384): + for chunk in dctx.read_to_iter(fh, read_size=16384): pass It is also possible to skip leading bytes in the input data:: dctx = zstd.ZstdDecompressor() - for chunk in dctx.read_from(fh, skip_bytes=1): + for chunk in dctx.read_to_iter(fh, skip_bytes=1): pass -Skipping leading bytes is useful if the source data contains extra -*header* data but you want to avoid the overhead of making a buffer copy -or allocating a new ``memoryview`` object in order to decompress the data. +.. tip:: -Similarly to ``ZstdCompressor.read_from()``, the consumer of the iterator + Skipping leading bytes is useful if the source data contains extra + *header* data. Traditionally, you would need to create a slice or + ``memoryview`` of the data you want to decompress. This would create + overhead. It is more efficient to pass the offset into this API. + +Similarly to ``ZstdCompressor.read_to_iter()``, the consumer of the iterator controls when data is decompressed. If the iterator isn't consumed, decompression is put on hold. 
-When ``read_from()`` is passed an object conforming to the buffer protocol, +When ``read_to_iter()`` is passed an object conforming to the buffer protocol, the behavior may seem similar to what occurs when the simple decompression API is used. However, this API works when the decompressed size is unknown. Furthermore, if feeding large inputs, the decompressor will work in chunks @@ -636,7 +682,7 @@ ^^^^^^^^^^^^^^^^ ``decompressobj()`` returns an object that exposes a ``decompress(data)`` -methods. Compressed data chunks are fed into ``decompress(data)`` and +method. Compressed data chunks are fed into ``decompress(data)`` and uncompressed output (or an empty bytes) is returned. Output from subsequent calls needs to be concatenated to reassemble the full decompressed byte sequence. @@ -650,11 +696,25 @@ Here is how this API should be used:: - dctx = zstd.ZstdDeompressor() - dobj = cctx.decompressobj() + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj() data = dobj.decompress(compressed_chunk_0) data = dobj.decompress(compressed_chunk_1) +By default, calls to ``decompress()`` write output data in chunks of size +``DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE``. These chunks are concatenated +before being returned to the caller. It is possible to define the size of +these temporary chunks by passing ``write_size`` to ``decompressobj()``:: + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj(write_size=1048576) + +.. note:: + + Because calls to ``decompress()`` may need to perform multiple + memory (re)allocations, this streaming decompression API isn't as + efficient as other APIs. + Batch Decompression API ^^^^^^^^^^^^^^^^^^^^^^^ @@ -671,9 +731,12 @@ minimal input validation will be done for that type. If calling from Python (as opposed to C), constructing one of these instances may add overhead cancelling out the performance overhead of validation for list -inputs. +inputs.:: -The decompressed size of each frame must be discoverable. 
It can either be + dctx = zstd.ZstdDecompressor() + results = dctx.multi_decompress_to_buffer([b'...', b'...']) + +The decompressed size of each frame MUST be discoverable. It can either be embedded within the zstd frame (``write_content_size=True`` argument to ``ZstdCompressor``) or passed in via the ``decompressed_sizes`` argument. @@ -681,7 +744,13 @@ protocol which holds an array of 64-bit unsigned integers in the machine's native format defining the decompressed sizes of each frame. If this argument is passed, it avoids having to scan each frame for its decompressed size. -This frame scanning can add noticeable overhead in some scenarios. +This frame scanning can add noticeable overhead in some scenarios.:: + + frames = [...] + sizes = struct.pack('=QQQQ', len0, len1, len2, len3) + + dctx = zstd.ZstdDecompressor() + results = dctx.multi_decompress_to_buffer(frames, decompressed_sizes=sizes) The ``threads`` argument controls the number of threads to use to perform decompression operations. The default (``0``) or the value ``1`` means to @@ -701,22 +770,23 @@ as possible by having as little overhead as possible. Since decompression is performed as a single operation and since the decompressed output is stored in a single buffer, extra memory allocations, Python objects, and Python function -calls are avoided. This is ideal for scenarios where callers need to access -decompressed data for multiple frames. +calls are avoided. This is ideal for scenarios where callers know up front that +they need to access data for multiple frames, such as when *delta chains* are +being used. Currently, the implementation always spawns multiple threads when requested, even if the amount of work to do is small. In the future, it will be smarter about avoiding threads and their associated overhead when the amount of work to do is small. 
-Content-Only Dictionary Chain Decompression -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Prefix Dictionary Chain Decompression +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``decompress_content_dict_chain(frames)`` performs decompression of a list of -zstd frames produced using chained *content-only* dictionary compression. Such +zstd frames produced using chained *prefix* dictionary compression. Such a list of frames is produced by compressing discrete inputs where each -non-initial input is compressed with a *content-only* dictionary consisting -of the content of the previous input. +non-initial input is compressed with a *prefix* dictionary consisting of the +content of the previous input. For example, say you have the following inputs:: @@ -725,25 +795,25 @@ The zstd frame chain consists of: 1. ``b'input 1'`` compressed in standalone/discrete mode -2. ``b'input 2'`` compressed using ``b'input 1'`` as a *content-only* dictionary -3. ``b'input 3'`` compressed using ``b'input 2'`` as a *content-only* dictionary +2. ``b'input 2'`` compressed using ``b'input 1'`` as a *prefix* dictionary +3. ``b'input 3'`` compressed using ``b'input 2'`` as a *prefix* dictionary Each zstd frame **must** have the content size written. -The following Python code can be used to produce a *content-only dictionary -chain*:: +The following Python code can be used to produce a *prefix dictionary chain*:: def make_chain(inputs): frames = [] # First frame is compressed in standalone/discrete mode. 
- zctx = zstd.ZstdCompressor(write_content_size=True) + zctx = zstd.ZstdCompressor() frames.append(zctx.compress(inputs[0])) - # Subsequent frames use the previous fulltext as a content-only dictionary + # Subsequent frames use the previous fulltext as a prefix dictionary for i, raw in enumerate(inputs[1:]): - dict_data = zstd.ZstdCompressionDict(inputs[i]) - zctx = zstd.ZstdCompressor(write_content_size=True, dict_data=dict_data) + dict_data = zstd.ZstdCompressionDict( + inputs[i], dict_type=zstd.DICT_TYPE_RAWCONTENT) + zctx = zstd.ZstdCompressor(dict_data=dict_data) frames.append(zctx.compress(raw)) return frames @@ -751,10 +821,13 @@ ``decompress_content_dict_chain()`` returns the uncompressed data of the last element in the input chain. -It is possible to implement *content-only dictionary chain* decompression -on top of other Python APIs. However, this function will likely be significantly -faster, especially for long input chains, as it avoids the overhead of -instantiating and passing around intermediate objects between C and Python. + +.. note:: + + It is possible to implement *prefix dictionary chain* decompression + on top of other APIs. However, this function will likely be faster - + especially for long input chains - as it avoids the overhead of instantiating + and passing around intermediate objects between C and Python. Multi-Threaded Compression -------------------------- @@ -764,9 +837,15 @@ into segments and each segment is fed into a worker pool for compression. Once a segment is compressed, it is flushed/appended to the output. +.. note:: + + These threads are created at the C layer and are not Python threads. So they + work outside the GIL. It is therefore possible to CPU saturate multiple cores + from Python. + The segment size for multi-threaded compression is chosen from the window size of the compressor. This is derived from the ``window_log`` attribute of a -``CompressionParameters`` instance. 
By default, segment sizes are in the 1+MB +``ZstdCompressionParameters`` instance. By default, segment sizes are in the 1+MB range. If multi-threaded compression is requested and the input is smaller than the @@ -785,31 +864,33 @@ there is a CPU/wall time versus size trade off that may warrant investigation. Output from multi-threaded compression does not require any special handling -on the decompression side. In other words, any zstd decompressor should be able -to consume data produced with multi-threaded compression. +on the decompression side. To the decompressor, data generated by a single +threaded compressor looks the same as data generated by a multi-threaded +compressor and does not require any special handling or additional resource +requirements. Dictionary Creation and Management ---------------------------------- -Compression dictionaries are represented as the ``ZstdCompressionDict`` type. +Compression dictionaries are represented with the ``ZstdCompressionDict`` type. Instances can be constructed from bytes:: dict_data = zstd.ZstdCompressionDict(data) -It is possible to construct a dictionary from *any* data. Unless the -data begins with a magic header, the dictionary will be treated as -*content-only*. *Content-only* dictionaries allow compression operations -that follow to reference raw data within the content. For one use of -*content-only* dictionaries, see -``ZstdDecompressor.decompress_content_dict_chain()``. +It is possible to construct a dictionary from *any* data. If the data doesn't +begin with a magic header, it will be treated as a *prefix* dictionary. +*Prefix* dictionaries allow compression operations to reference raw data +within the dictionary.
-More interestingly, instances can be created by *training* on sample data:: +It is possible to force the use of *prefix* dictionaries or to require a +dictionary header:: - dict_data = zstd.train_dictionary(size, samples) + dict_data = zstd.ZstdCompressionDict(data, + dict_type=zstd.DICT_TYPE_RAWCONTENT) -This takes a list of bytes instances and creates and returns a -``ZstdCompressionDict``. + dict_data = zstd.ZstdCompressionDict(data, + dict_type=zstd.DICT_TYPE_FULLDICT) You can see how many bytes are in the dictionary by calling ``len()``:: @@ -819,7 +900,7 @@ Once you have a dictionary, you can pass it to the objects performing compression and decompression:: - dict_data = zstd.train_dictionary(16384, samples) + dict_data = zstd.train_dictionary(131072, samples) cctx = zstd.ZstdCompressor(dict_data=dict_data) for source_data in input_data: @@ -829,7 +910,7 @@ dctx = zstd.ZstdDecompressor(dict_data=dict_data) for compressed_data in input_data: buffer = io.BytesIO() - with dctx.write_to(buffer) as decompressor: + with dctx.stream_writer(buffer) as decompressor: decompressor.write(compressed_data) # Do something with raw data in ``buffer``. @@ -843,56 +924,69 @@ dict_data = zstd.train_dictionary(size, samples) raw_data = dict_data.as_bytes() -The following named arguments to ``train_dictionary`` can also be used -to further control dictionary generation. +By default, when a ``ZstdCompressionDict`` is *attached* to a +``ZstdCompressor``, each ``ZstdCompressor`` performs work to prepare the +dictionary for use. This is fine if only 1 compression operation is being +performed or if the ``ZstdCompressor`` is being reused for multiple operations. +But if multiple ``ZstdCompressor`` instances are being used with the dictionary, +this can add overhead. -selectivity - Integer selectivity level. Default is 9. Larger values yield more data in - dictionary. -level - Integer compression level. Default is 6. -dict_id - Integer dictionary ID for the produced dictionary.
Default is 0, which - means to use a random value. -notifications - Controls writing of informational messages to ``stderr``. ``0`` (the - default) means to write nothing. ``1`` writes errors. ``2`` writes - progression info. ``3`` writes more details. And ``4`` writes all info. +It is possible to *precompute* the dictionary so it can readily be consumed +by multiple ``ZstdCompressor`` instances:: + + d = zstd.ZstdCompressionDict(data) -Cover Dictionaries -^^^^^^^^^^^^^^^^^^ + # Precompute for compression level 3. + d.precompute_compress(level=3) -An alternate dictionary training mechanism named *cover* is also available. -More details about this training mechanism are available in the paper -*Effective Construction of Relative Lempel-Ziv Dictionaries* (authors: -Liao, Petri, Moffat, Wirth). - -To use this mechanism, use ``zstd.train_cover_dictionary()`` instead of -``zstd.train_dictionary()``. The function behaves nearly the same except -its arguments are different and the returned dictionary will contain ``k`` -and ``d`` attributes reflecting the parameters to the cover algorithm. + # Precompute with specific compression parameters. + params = zstd.ZstdCompressionParameters(...) + d.precompute_compress(compression_params=params) .. note:: - The ``k`` and ``d`` attributes are only populated on dictionary - instances created by this function. If a ``ZstdCompressionDict`` is - constructed from raw bytes data, the ``k`` and ``d`` attributes will - be ``0``. + When a dictionary is precomputed, the compression parameters used to + precompute the dictionary overwrite some of the compression parameters + specified to ``ZstdCompressor.__init__``. + +Training Dictionaries +^^^^^^^^^^^^^^^^^^^^^ + +Unless using *prefix* dictionaries, dictionary data is produced by *training* +on existing data:: + + dict_data = zstd.train_dictionary(size, samples) + +This takes a target dictionary size and list of bytes instances and creates and +returns a ``ZstdCompressionDict``. 
+ +The dictionary training mechanism is known as *cover*. More details about it are +available in the paper *Effective Construction of Relative Lempel-Ziv +Dictionaries* (authors: Liao, Petri, Moffat, Wirth). + +The cover algorithm takes parameters ``k`` and ``d``. These are the +*segment size* and *dmer size*, respectively. The returned dictionary +instance created by this function has ``k`` and ``d`` attributes +containing the values for these parameters. If a ``ZstdCompressionDict`` +is constructed from raw bytes data (a content-only dictionary), the +``k`` and ``d`` attributes will be ``0``. The segment and dmer size parameters to the cover algorithm can either be -specified manually or you can ask ``train_cover_dictionary()`` to try -multiple values and pick the best one, where *best* means the smallest -compressed data size. - -In manual mode, the ``k`` and ``d`` arguments must be specified or a -``ZstdError`` will be raised. +specified manually or ``train_dictionary()`` can try multiple values +and pick the best one, where *best* means the smallest compressed data size. +This latter mode is called *optimization* mode. -In automatic mode (triggered by specifying ``optimize=True``), ``k`` -and ``d`` are optional. If a value isn't specified, then default values for -both are tested. The ``steps`` argument can control the number of steps -through ``k`` values. The ``level`` argument defines the compression level -that will be used when testing the compressed size. And ``threads`` can -specify the number of threads to use for concurrent operation. +If none of ``k``, ``d``, ``steps``, ``threads``, ``level``, ``notifications``, +or ``dict_id`` (basically anything from the underlying ``ZDICT_cover_params_t`` +struct) are defined, *optimization* mode is used with default parameter +values. + +If ``steps`` or ``threads`` are defined, then *optimization* mode is engaged +with explicit control over those parameters.
Specifying ``threads=0`` or +``threads=1`` can be used to engage *optimization* mode if other parameters +are not defined. + +Otherwise, non-*optimization* mode is used with the parameters specified. This function takes the following arguments: @@ -909,64 +1003,92 @@ dict_id Integer dictionary ID for the produced dictionary. Default is 0, which uses a random value. -optimize - When true, test dictionary generation with multiple parameters. +steps + Number of steps through ``k`` values to perform when trying parameter + variations. +threads + Number of threads to use when trying parameter variations. Default is 0, + which means to use a single thread. A negative value can be specified to + use as many threads as there are detected logical CPUs. level - Integer target compression level when testing compression with - ``optimize=True``. Default is 1. -steps - Number of steps through ``k`` values to perform when ``optimize=True``. - Default is 32. -threads - Number of threads to use when ``optimize=True``. Default is 0, which means - to use a single thread. A negative value can be specified to use as many - threads as there are detected logical CPUs. + Integer target compression level when trying parameter variations. notifications - Controls writing of informational messages to ``stderr``. See the - documentation for ``train_dictionary()`` for more. + Controls writing of informational messages to ``stderr``. ``0`` (the + default) means to write nothing. ``1`` writes errors. ``2`` writes + progression info. ``3`` writes more details. And ``4`` writes all info. Explicit Compression Parameters ------------------------------- -Zstandard's integer compression levels along with the input size and dictionary -size are converted into a data structure defining multiple parameters to tune -behavior of the compression algorithm. It is possible to use define this -data structure explicitly to have lower-level control over compression behavior. 
+Zstandard offers a high-level *compression level* that maps to lower-level +compression parameters. For many consumers, this numeric level is the only +compression setting you'll need to touch. + +But for advanced use cases, it might be desirable to tweak these lower-level +settings. -The ``zstd.CompressionParameters`` type represents this data structure. -You can see how Zstandard converts compression levels to this data structure -by calling ``zstd.get_compression_parameters()``. e.g.:: +The ``ZstdCompressionParameters`` type represents these low-level compression +settings. - params = zstd.get_compression_parameters(5) +Instances of this type can be constructed from a myriad of keyword arguments +(defined below) for complete low-level control over each adjustable +compression setting. + +From a higher level, one can construct a ``ZstdCompressionParameters`` instance +given a desired compression level and target input and dictionary size +using ``ZstdCompressionParameters.from_level()``. e.g.:: -This function also accepts the uncompressed data size and dictionary size -to adjust parameters:: + # Derive compression settings for compression level 7. + params = zstd.ZstdCompressionParameters.from_level(7) - params = zstd.get_compression_parameters(3, source_size=len(data), dict_size=len(dict_data)) + # With an input size of 1MB + params = zstd.ZstdCompressionParameters.from_level(7, source_size=1048576) + +Using ``from_level()``, it is also possible to override individual compression +parameters or to define additional settings that aren't automatically derived. 
+e.g.:: -You can also construct compression parameters from their low-level components:: + params = zstd.ZstdCompressionParameters.from_level(4, window_log=10) + params = zstd.ZstdCompressionParameters.from_level(5, threads=4) + +Or you can define low-level compression settings directly:: - params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST) + params = zstd.ZstdCompressionParameters(window_log=12, enable_ldm=True) -You can then configure a compressor to use the custom parameters:: +Once a ``ZstdCompressionParameters`` instance is obtained, it can be used to +configure a compressor:: cctx = zstd.ZstdCompressor(compression_params=params) -The members/attributes of ``CompressionParameters`` instances are as follows:: +The named arguments and attributes of ``ZstdCompressionParameters`` are as +follows: +* format +* compression_level * window_log +* hash_log * chain_log -* hash_log * search_log -* search_length +* min_match * target_length -* strategy +* compression_strategy +* write_content_size +* write_checksum +* write_dict_id +* job_size +* overlap_size_log +* compress_literals +* force_max_window +* enable_ldm +* ldm_hash_log +* ldm_min_match +* ldm_bucket_size_log +* ldm_hash_every_log +* threads -This is the order the arguments are passed to the constructor if not using -named arguments. - -You'll need to read the Zstandard documentation for what these parameters -do. +Some of these are very low-level settings. It may help to consult the official +zstandard documentation for their behavior. Look for the ``ZSTD_p_*`` constants +in ``zstd.h`` (https://github.com/facebook/zstd/blob/dev/lib/zstd.h). Frame Inspection ---------------- @@ -1003,15 +1125,17 @@ Bool indicating whether a 4 byte content checksum is stored at the end of the frame. +``zstd.frame_header_size(data)`` returns the size of the zstandard frame +header. + +``zstd.frame_content_size(data)`` returns the content size as parsed from +the frame header. 
``-1`` means the content size is unknown. ``0`` means +an empty frame. The content size is usually correct. However, it may not +be accurate. + Misc Functionality ------------------ -estimate_compression_context_size(CompressionParameters) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Given a ``CompressionParameters`` struct, estimate the memory size required -to perform compression. - estimate_decompression_context_size() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1041,6 +1165,11 @@ MAGIC_NUMBER Frame header as an integer +CONTENTSIZE_UNKNOWN + Value for content size when the content size is unknown. +CONTENTSIZE_ERROR + Value for content size when content size couldn't be determined. + WINDOWLOG_MIN Minimum value for compression parameter WINDOWLOG_MAX @@ -1063,8 +1192,6 @@ Maximum value for compression parameter TARGETLENGTH_MIN Minimum value for compression parameter -TARGETLENGTH_MAX - Maximum value for compression parameter STRATEGY_FAST Compression strategy STRATEGY_DFAST @@ -1079,6 +1206,13 @@ Compression strategy STRATEGY_BTOPT Compression strategy +STRATEGY_BTULTRA + Compression strategy + +FORMAT_ZSTD1 + Zstandard frame format +FORMAT_ZSTD1_MAGICLESS + Zstandard frame format without magic header Performance Considerations -------------------------- @@ -1090,7 +1224,7 @@ operation. The differences are magnified as the size of data decreases. For example, the difference between *context* reuse and non-reuse for 100,000 100 byte inputs will be significant (possiby over 10x faster to reuse contexts) -whereas 10 1,000,000 byte inputs will be more similar in speed (because the +whereas 10 100,000,000 byte inputs will be more similar in speed (because the time spent doing compression dwarfs time spent creating new *contexts*). Buffer Types @@ -1187,9 +1321,8 @@ because different applications have different needs and the library wants to facilitate optimal use in as many use cases as possible. 
-From a high-level, APIs are divided into *one-shot* and *streaming*. See -the ``Concepts`` section for a description of how these are different at -the C layer. +From a high-level, APIs are divided into *one-shot* and *streaming*: either you +are operating on all data at once or you operate on it piecemeal. The *one-shot* APIs are useful for small data, where the input or output size is known. (The size can come from a buffer length, file size, or @@ -1222,145 +1355,39 @@ underlying stream (say from interacting with a filesystem or network). This could add considerable overhead. -Concepts -======== - -It is important to have a basic understanding of how Zstandard works in order -to optimally use this library. In addition, there are some low-level Python -concepts that are worth explaining to aid understanding. This section aims to -provide that knowledge. - -Zstandard Frames and Compression Format ---------------------------------------- - -Compressed zstandard data almost always exists within a container called a -*frame*. (For the technically curious, see the -`specification _.) - -The frame contains a header and optional trailer. The header contains a -magic number to self-identify as a zstd frame and a description of the -compressed data that follows. - -Among other things, the frame *optionally* contains the size of the -decompressed data the frame represents, a 32-bit checksum of the -decompressed data (to facilitate verification during decompression), -and the ID of the dictionary used to compress the data. - -Storing the original content size in the frame (``write_content_size=True`` -to ``ZstdCompressor``) is important for performance in some scenarios. Having -the decompressed size stored there (or storing it elsewhere) allows -decompression to perform a single memory allocation that is exactly sized to -the output. This is faster than continuously growing a memory buffer to hold -output. 
+Thread Safety +============= -Compression and Decompression Contexts --------------------------------------- - -In order to perform a compression or decompression operation with the zstd -C API, you need what's called a *context*. A context essentially holds -configuration and state for a compression or decompression operation. For -example, a compression context holds the configured compression level. - -Contexts can be reused for multiple operations. Since creating and -destroying contexts is not free, there are performance advantages to -reusing contexts. - -The ``ZstdCompressor`` and ``ZstdDecompressor`` types are essentially -wrappers around these contexts in the zstd C API. +``ZstdCompressor`` and ``ZstdDecompressor`` instances have no guarantees +about thread safety. Do not operate on the same ``ZstdCompressor`` and +``ZstdDecompressor`` instance simultaneously from different threads. It is +fine to have different threads call into a single instance, just not at the +same time. -One-shot And Streaming Operations ---------------------------------- - -A compression or decompression operation can either be performed as a -single *one-shot* operation or as a continuous *streaming* operation. - -In one-shot mode (the *simple* APIs provided by the Python interface), -**all** input is handed to the compressor or decompressor as a single buffer -and **all** output is returned as a single buffer. - -In streaming mode, input is delivered to the compressor or decompressor as -a series of chunks via multiple function calls. Likewise, output is -obtained in chunks as well. - -Streaming operations require an additional *stream* object to be created -to track the operation. These are logical extensions of *context* -instances. +Some operations require multiple function calls to complete. e.g. streaming +operations. A single ``ZstdCompressor`` or ``ZstdDecompressor`` cannot be used +for simultaneously active operations. e.g. 
you must not start a streaming +operation when another streaming operation is already active. -There are advantages and disadvantages to each mode of operation. There -are scenarios where certain modes can't be used. See the -``Choosing an API`` section for more. - -Dictionaries ------------- - -A compression *dictionary* is essentially data used to seed the compressor -state so it can achieve better compression. The idea is that if you are -compressing a lot of similar pieces of data (e.g. JSON documents or anything -sharing similar structure), then you can find common patterns across multiple -objects then leverage those common patterns during compression and -decompression operations to achieve better compression ratios. - -Dictionary compression is generally only useful for small inputs - data no -larger than a few kilobytes. The upper bound on this range is highly dependent -on the input data and the dictionary. - -Python Buffer Protocol ----------------------- - -Many functions in the library operate on objects that implement Python's -`buffer protocol `_. - -The *buffer protocol* is an internal implementation detail of a Python -type that allows instances of that type (objects) to be exposed as a raw -pointer (or buffer) in the C API. In other words, it allows objects to be -exposed as an array of bytes. +The C extension releases the GIL during non-trivial calls into the zstd C +API. Non-trivial calls are notably compression and decompression. Trivial +calls are things like parsing frame parameters. Where the GIL is released +is considered an implementation detail and can change in any release. -From the perspective of the C API, objects implementing the *buffer protocol* -all look the same: they are just a pointer to a memory address of a defined -length. This allows the C API to be largely type agnostic when accessing their -data. This allows custom types to be passed in without first converting them -to a specific type. 
- -Many Python types implement the buffer protocol. These include ``bytes`` -(``str`` on Python 2), ``bytearray``, ``array.array``, ``io.BytesIO``, -``mmap.mmap``, and ``memoryview``. - -``python-zstandard`` APIs that accept objects conforming to the buffer -protocol require that the buffer is *C contiguous* and has a single -dimension (``ndim==1``). This is usually the case. An example of where it -is not is a Numpy matrix type. - -Requiring Output Sizes for Non-Streaming Decompression APIs ------------------------------------------------------------ - -Non-streaming decompression APIs require that either the output size is -explicitly defined (either in the zstd frame header or passed into the -function) or that a max output size is specified. This restriction is for -your safety. - -The *one-shot* decompression APIs store the decompressed result in a -single buffer. This means that a buffer needs to be pre-allocated to hold -the result. If the decompressed size is not known, then there is no universal -good default size to use. Any default will fail or will be highly sub-optimal -in some scenarios (it will either be too small or will put stress on the -memory allocator to allocate a too large block). - -A *helpful* API may retry decompression with buffers of increasing size. -While useful, there are obvious performance disadvantages, namely redoing -decompression N times until it works. In addition, there is a security -concern. Say the input came from highly compressible data, like 1 GB of the -same byte value. The output size could be several magnitudes larger than the -input size. An input of <100KB could decompress to >1GB. Without a bounds -restriction on the decompressed size, certain inputs could exhaust all system -memory. That's not good and is why the maximum output size is limited. +APIs that accept bytes-like objects don't enforce that the underlying object +is read-only. 
However, it is assumed that the passed object is read-only for +the duration of the function call. It is possible to pass a mutable object +(like a ``bytearray``) to e.g. ``ZstdCompressor.compress()``, have the GIL +released, and mutate the object from another thread. Such a race condition +is a bug in the consumer of python-zstandard. Most Python data types are +immutable, so unless you are doing something fancy, you don't need to +worry about this. Note on Zstandard's *Experimental* API ====================================== Many of the Zstandard APIs used by this module are marked as *experimental* -within the Zstandard project. This includes a large number of useful -features, such as compression and frame parameters and parts of dictionary -compression. +within the Zstandard project. It is unclear how Zstandard's C API will evolve over time, especially with regards to this *experimental* functionality. We will try to maintain @@ -1371,7 +1398,7 @@ module and since we compile against it, the behavior of a specific version of this module should be constant for all of time. So if you pin the version of this module used in your projects (which is a Python -best practice), you should be buffered from unwanted future changes. +best practice), you should be shielded from unwanted future changes. 
Donate ====== diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/bufferutil.c --- a/contrib/python-zstandard/c-ext/bufferutil.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/bufferutil.c Wed Apr 18 15:32:08 2018 -0400 @@ -83,7 +83,7 @@ } if (segments.len % sizeof(BufferSegment)) { - PyErr_Format(PyExc_ValueError, "segments array size is not a multiple of %lu", + PyErr_Format(PyExc_ValueError, "segments array size is not a multiple of %zu", sizeof(BufferSegment)); goto except; } @@ -123,7 +123,7 @@ PyBuffer_Release(&self->parent); PyBuffer_Release(&segments); return -1; -}; +} /** * Construct a BufferWithSegments from existing memory and offsets. @@ -188,6 +188,12 @@ return NULL; } + if (self->segments[i].length > PY_SSIZE_T_MAX) { + PyErr_Format(PyExc_ValueError, + "item at offset %zd is too large for this platform", i); + return NULL; + } + result = (ZstdBufferSegment*)PyObject_CallObject((PyObject*)&ZstdBufferSegmentType, NULL); if (NULL == result) { return NULL; @@ -197,7 +203,7 @@ Py_INCREF(self); result->data = (char*)self->data + self->segments[i].offset; - result->dataSize = self->segments[i].length; + result->dataSize = (Py_ssize_t)self->segments[i].length; result->offset = self->segments[i].offset; return result; @@ -205,7 +211,13 @@ #if PY_MAJOR_VERSION >= 3 static int BufferWithSegments_getbuffer(ZstdBufferWithSegments* self, Py_buffer* view, int flags) { - return PyBuffer_FillInfo(view, (PyObject*)self, self->data, self->dataSize, 1, flags); + if (self->dataSize > PY_SSIZE_T_MAX) { + view->obj = NULL; + PyErr_SetString(PyExc_BufferError, "buffer is too large for this platform"); + return -1; + } + + return PyBuffer_FillInfo(view, (PyObject*)self, self->data, (Py_ssize_t)self->dataSize, 1, flags); } #else static Py_ssize_t BufferWithSegments_getreadbuffer(ZstdBufferWithSegments* self, Py_ssize_t segment, void **ptrptr) { @@ -214,8 +226,13 @@ return -1; } + if (self->dataSize > PY_SSIZE_T_MAX) { + 
PyErr_SetString(PyExc_ValueError, "buffer is too large for this platform"); + return -1; + } + *ptrptr = self->data; - return self->dataSize; + return (Py_ssize_t)self->dataSize; } static Py_ssize_t BufferWithSegments_getsegcount(ZstdBufferWithSegments* self, Py_ssize_t* len) { @@ -232,7 +249,12 @@ ); static PyObject* BufferWithSegments_tobytes(ZstdBufferWithSegments* self) { - return PyBytes_FromStringAndSize(self->data, self->dataSize); + if (self->dataSize > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_ValueError, "buffer is too large for this platform"); + return NULL; + } + + return PyBytes_FromStringAndSize(self->data, (Py_ssize_t)self->dataSize); } PyDoc_STRVAR(BufferWithSegments_segments__doc__, diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/compressiondict.c --- a/contrib/python-zstandard/c-ext/compressiondict.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/compressiondict.c Wed Apr 18 15:32:08 2018 -0400 @@ -14,125 +14,11 @@ static char* kwlist[] = { "dict_size", "samples", - "selectivity", - "level", - "notifications", - "dict_id", - NULL - }; - size_t capacity; - PyObject* samples; - Py_ssize_t samplesLen; - unsigned selectivity = 0; - int level = 0; - unsigned notifications = 0; - unsigned dictID = 0; - ZDICT_params_t zparams; - Py_ssize_t sampleIndex; - Py_ssize_t sampleSize; - PyObject* sampleItem; - size_t zresult; - void* sampleBuffer = NULL; - void* sampleOffset; - size_t samplesSize = 0; - size_t* sampleSizes = NULL; - void* dict = NULL; - ZstdCompressionDict* result = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary", - kwlist, - &capacity, - &PyList_Type, &samples, - &selectivity, &level, &notifications, &dictID)) { - return NULL; - } - - memset(&zparams, 0, sizeof(zparams)); - - zparams.selectivityLevel = selectivity; - zparams.compressionLevel = level; - zparams.notificationLevel = notifications; - zparams.dictID = dictID; - - /* Figure out the size of the raw samples */
- samplesLen = PyList_Size(samples); - for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { - sampleItem = PyList_GetItem(samples, sampleIndex); - if (!PyBytes_Check(sampleItem)) { - PyErr_SetString(PyExc_ValueError, "samples must be bytes"); - return NULL; - } - samplesSize += PyBytes_GET_SIZE(sampleItem); - } - - /* Now that we know the total size of the raw simples, we can allocate - a buffer for the raw data */ - sampleBuffer = PyMem_Malloc(samplesSize); - if (!sampleBuffer) { - PyErr_NoMemory(); - goto finally; - } - sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t)); - if (!sampleSizes) { - PyErr_NoMemory(); - goto finally; - } - - sampleOffset = sampleBuffer; - /* Now iterate again and assemble the samples in the buffer */ - for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { - sampleItem = PyList_GetItem(samples, sampleIndex); - sampleSize = PyBytes_GET_SIZE(sampleItem); - sampleSizes[sampleIndex] = sampleSize; - memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize); - sampleOffset = (char*)sampleOffset + sampleSize; - } - - dict = PyMem_Malloc(capacity); - if (!dict) { - PyErr_NoMemory(); - goto finally; - } - - /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */ - Py_BEGIN_ALLOW_THREADS - zresult = ZDICT_trainFromBuffer_advanced(dict, capacity, - sampleBuffer, sampleSizes, (unsigned int)samplesLen, - zparams); - Py_END_ALLOW_THREADS - if (ZDICT_isError(zresult)) { - PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult)); - PyMem_Free(dict); - goto finally; - } - - result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType); - if (!result) { - goto finally; - } - - result->dictData = dict; - result->dictSize = zresult; - result->d = 0; - result->k = 0; - -finally: - PyMem_Free(sampleBuffer); - PyMem_Free(sampleSizes); - - return result; -} - -ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { - static char* 
kwlist[] = { - "dict_size", - "samples", "k", "d", "notifications", "dict_id", "level", - "optimize", "steps", "threads", NULL @@ -145,10 +31,9 @@ unsigned notifications = 0; unsigned dictID = 0; int level = 0; - PyObject* optimize = NULL; unsigned steps = 0; int threads = 0; - COVER_params_t params; + ZDICT_cover_params_t params; Py_ssize_t samplesLen; Py_ssize_t i; size_t samplesSize = 0; @@ -160,9 +45,9 @@ size_t zresult; ZstdCompressionDict* result = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary", kwlist, &capacity, &PyList_Type, &samples, - &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) { + &k, &d, &notifications, &dictID, &level, &steps, &threads)) { return NULL; } @@ -175,9 +60,9 @@ params.d = d; params.steps = steps; params.nbThreads = threads; - params.notificationLevel = notifications; - params.dictID = dictID; - params.compressionLevel = level; + params.zParams.notificationLevel = notifications; + params.zParams.dictID = dictID; + params.zParams.compressionLevel = level; /* Figure out total size of input samples. */ samplesLen = PyList_Size(samples); @@ -219,12 +104,21 @@ } Py_BEGIN_ALLOW_THREADS - if (optimize && PyObject_IsTrue(optimize)) { - zresult = COVER_optimizeTrainFromBuffer(dict, capacity, + /* No parameters uses the default function, which will use default params + and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */ + if (!params.k && !params.d && !params.zParams.compressionLevel + && !params.zParams.notificationLevel && !params.zParams.dictID) { + zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer, + sampleSizes, (unsigned)samplesLen); + } + /* Use optimize mode if user controlled steps or threads explicitly.
*/ + else if (params.steps || params.nbThreads) { + zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity, sampleBuffer, sampleSizes, (unsigned)samplesLen, &params); } + /* Non-optimize mode with explicit control. */ else { - zresult = COVER_trainFromBuffer(dict, capacity, + zresult = ZDICT_trainFromBuffer_cover(dict, capacity, sampleBuffer, sampleSizes, (unsigned)samplesLen, params); } Py_END_ALLOW_THREADS @@ -243,8 +137,11 @@ result->dictData = dict; result->dictSize = zresult; + result->dictType = ZSTD_dct_fullDict; result->d = params.d; result->k = params.k; + result->cdict = NULL; + result->ddict = NULL; finally: PyMem_Free(sampleBuffer); @@ -253,43 +150,99 @@ return result; } +int ensure_ddict(ZstdCompressionDict* dict) { + if (dict->ddict) { + return 0; + } + + Py_BEGIN_ALLOW_THREADS + dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize, + ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem); + Py_END_ALLOW_THREADS + if (!dict->ddict) { + PyErr_SetString(ZstdError, "could not create decompression dict"); + return 1; + } + + return 0; +} + PyDoc_STRVAR(ZstdCompressionDict__doc__, "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" "\n" "This type holds the results of a computed Zstandard compression dictionary.\n" -"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" -"obtained from another source into the constructor.\n" +"Instances are obtained by calling ``train_dictionary()`` or by passing\n" +"bytes obtained from another source into the constructor.\n" ); -static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; +static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + "dict_type", + NULL + }; + + int result = -1; + Py_buffer source; + unsigned dictType = ZSTD_dct_auto; self->dictData = NULL; self->dictSize = 0; + self->cdict = NULL;
+ self->ddict = NULL; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict", #else - if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict", #endif - &source, &sourceSize)) { + kwlist, &source, &dictType)) { return -1; } - self->dictData = PyMem_Malloc(sourceSize); + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + + if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent + && dictType != ZSTD_dct_fullDict) { + PyErr_Format(PyExc_ValueError, + "invalid dictionary load mode: %d; must use DICT_TYPE_* constants", + dictType); + goto finally; + } + + self->dictType = dictType; + + self->dictData = PyMem_Malloc(source.len); if (!self->dictData) { PyErr_NoMemory(); - return -1; + goto finally; } - memcpy(self->dictData, source, sourceSize); - self->dictSize = sourceSize; + memcpy(self->dictData, source.buf, source.len); + self->dictSize = source.len; + + result = 0; - return 0; +finally: + PyBuffer_Release(&source); + return result; +} + +static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { + if (self->cdict) { + ZSTD_freeCDict(self->cdict); + self->cdict = NULL; } -static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { + if (self->ddict) { + ZSTD_freeDDict(self->ddict); + self->ddict = NULL; + } + if (self->dictData) { PyMem_Free(self->dictData); self->dictData = NULL; @@ -298,6 +251,74 @@ PyObject_Del(self); } +PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__, +"Precompute a dictionary so it can be used by multiple compressors.\n" +); + +static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "level", + 
"compression_params", + NULL + }; + + int level = 0; + ZstdCompressionParametersObject* compressionParams = NULL; + ZSTD_compressionParameters cParams; + size_t zresult; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist, + &level, &ZstdCompressionParametersType, &compressionParams)) { + return NULL; + } + + if (level && compressionParams) { + PyErr_SetString(PyExc_ValueError, + "must only specify one of level or compression_params"); + return NULL; + } + + if (!level && !compressionParams) { + PyErr_SetString(PyExc_ValueError, + "must specify one of level or compression_params"); + return NULL; + } + + if (self->cdict) { + zresult = ZSTD_freeCDict(self->cdict); + self->cdict = NULL; + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "unable to free CDict: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + } + + if (level) { + cParams = ZSTD_getCParams(level, 0, self->dictSize); + } + else { + cParams.chainLog = compressionParams->chainLog; + cParams.hashLog = compressionParams->hashLog; + cParams.searchLength = compressionParams->minMatch; + cParams.searchLog = compressionParams->searchLog; + cParams.strategy = compressionParams->compressionStrategy; + cParams.targetLength = compressionParams->targetLength; + cParams.windowLog = compressionParams->windowLog; + } + + assert(!self->cdict); + self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize, + ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem); + + if (!self->cdict) { + PyErr_SetString(ZstdError, "unable to precompute dictionary"); + return NULL; + } + + Py_RETURN_NONE; +} + static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); @@ -313,6 +334,8 @@ PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") 
}, + { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress, + METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ }, { NULL, NULL } }; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/compressionparams.c --- a/contrib/python-zstandard/c-ext/compressionparams.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/compressionparams.c Wed Apr 18 15:32:08 2018 -0400 @@ -8,204 +8,448 @@ #include "python-zstandard.h" -void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams) { - zparams->windowLog = params->windowLog; - zparams->chainLog = params->chainLog; - zparams->hashLog = params->hashLog; - zparams->searchLog = params->searchLog; - zparams->searchLength = params->searchLength; - zparams->targetLength = params->targetLength; - zparams->strategy = params->strategy; -} +extern PyObject* ZstdError; -CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args) { - int compressionLevel; - unsigned PY_LONG_LONG sourceSize = 0; - Py_ssize_t dictSize = 0; - ZSTD_compressionParameters params; - CompressionParametersObject* result; - - if (!PyArg_ParseTuple(args, "i|Kn:get_compression_parameters", - &compressionLevel, &sourceSize, &dictSize)) { - return NULL; +int set_parameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value) { + size_t zresult = ZSTD_CCtxParam_setParameter(params, param, value); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "unable to set compression context parameter: %s", + ZSTD_getErrorName(zresult)); + return 1; } - params = ZSTD_getCParams(compressionLevel, sourceSize, dictSize); + return 0; +} + +#define TRY_SET_PARAMETER(params, param, value) if (set_parameter(params, param, value)) return -1; - result = PyObject_New(CompressionParametersObject, &CompressionParametersType); - if (!result) { - return NULL; +int set_parameters(ZSTD_CCtx_params* params, 
ZstdCompressionParametersObject* obj) { + TRY_SET_PARAMETER(params, ZSTD_p_format, obj->format); + TRY_SET_PARAMETER(params, ZSTD_p_compressionLevel, (unsigned)obj->compressionLevel); + TRY_SET_PARAMETER(params, ZSTD_p_windowLog, obj->windowLog); + TRY_SET_PARAMETER(params, ZSTD_p_hashLog, obj->hashLog); + TRY_SET_PARAMETER(params, ZSTD_p_chainLog, obj->chainLog); + TRY_SET_PARAMETER(params, ZSTD_p_searchLog, obj->searchLog); + TRY_SET_PARAMETER(params, ZSTD_p_minMatch, obj->minMatch); + TRY_SET_PARAMETER(params, ZSTD_p_targetLength, obj->targetLength); + TRY_SET_PARAMETER(params, ZSTD_p_compressionStrategy, obj->compressionStrategy); + TRY_SET_PARAMETER(params, ZSTD_p_contentSizeFlag, obj->contentSizeFlag); + TRY_SET_PARAMETER(params, ZSTD_p_checksumFlag, obj->checksumFlag); + TRY_SET_PARAMETER(params, ZSTD_p_dictIDFlag, obj->dictIDFlag); + TRY_SET_PARAMETER(params, ZSTD_p_nbWorkers, obj->threads); + TRY_SET_PARAMETER(params, ZSTD_p_jobSize, obj->jobSize); + TRY_SET_PARAMETER(params, ZSTD_p_overlapSizeLog, obj->overlapSizeLog); + TRY_SET_PARAMETER(params, ZSTD_p_compressLiterals, obj->compressLiterals); + TRY_SET_PARAMETER(params, ZSTD_p_forceMaxWindow, obj->forceMaxWindow); + TRY_SET_PARAMETER(params, ZSTD_p_enableLongDistanceMatching, obj->enableLongDistanceMatching); + TRY_SET_PARAMETER(params, ZSTD_p_ldmHashLog, obj->ldmHashLog); + TRY_SET_PARAMETER(params, ZSTD_p_ldmMinMatch, obj->ldmMinMatch); + TRY_SET_PARAMETER(params, ZSTD_p_ldmBucketSizeLog, obj->ldmBucketSizeLog); + TRY_SET_PARAMETER(params, ZSTD_p_ldmHashEveryLog, obj->ldmHashEveryLog); + + return 0; +} + +int reset_params(ZstdCompressionParametersObject* params) { + if (params->params) { + ZSTD_CCtxParams_reset(params->params); + } + else { + params->params = ZSTD_createCCtxParams(); + if (!params->params) { + PyErr_NoMemory(); + return 1; + } } - result->windowLog = params.windowLog; - result->chainLog = params.chainLog; - result->hashLog = params.hashLog; - result->searchLog = params.searchLog; - 
result->searchLength = params.searchLength; - result->targetLength = params.targetLength; - result->strategy = params.strategy; - - return result; + return set_parameters(params->params, params); } -static int CompressionParameters_init(CompressionParametersObject* self, PyObject* args, PyObject* kwargs) { +static int ZstdCompressionParameters_init(ZstdCompressionParametersObject* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { + "format", + "compression_level", "window_log", - "chain_log", "hash_log", + "chain_log", "search_log", - "search_length", + "min_match", "target_length", - "strategy", + "compression_strategy", + "write_content_size", + "write_checksum", + "write_dict_id", + "job_size", + "overlap_size_log", + "force_max_window", + "enable_ldm", + "ldm_hash_log", + "ldm_min_match", + "ldm_bucket_size_log", + "ldm_hash_every_log", + "threads", + "compress_literals", NULL }; - unsigned windowLog; - unsigned chainLog; - unsigned hashLog; - unsigned searchLog; - unsigned searchLength; - unsigned targetLength; - unsigned strategy; - ZSTD_compressionParameters params; - size_t zresult; + unsigned format = 0; + int compressionLevel = 0; + unsigned windowLog = 0; + unsigned hashLog = 0; + unsigned chainLog = 0; + unsigned searchLog = 0; + unsigned minMatch = 0; + unsigned targetLength = 0; + unsigned compressionStrategy = 0; + unsigned contentSizeFlag = 1; + unsigned checksumFlag = 0; + unsigned dictIDFlag = 0; + unsigned jobSize = 0; + unsigned overlapSizeLog = 0; + unsigned forceMaxWindow = 0; + unsigned enableLDM = 0; + unsigned ldmHashLog = 0; + unsigned ldmMinMatch = 0; + unsigned ldmBucketSizeLog = 0; + unsigned ldmHashEveryLog = 0; + int threads = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "IIIIIII:CompressionParameters", - kwlist, &windowLog, &chainLog, &hashLog, &searchLog, &searchLength, - &targetLength, &strategy)) { - return -1; - } + /* Setting value 0 has the effect of disabling. 
So we use -1 as a default + * to detect whether to set. Then we automatically derive the expected value + * based on the level, just like zstandard does itself. */ + int compressLiterals = -1; - if (windowLog < ZSTD_WINDOWLOG_MIN || windowLog > ZSTD_WINDOWLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid window log value"); - return -1; - } - - if (chainLog < ZSTD_CHAINLOG_MIN || chainLog > ZSTD_CHAINLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid chain log value"); - return -1; - } - - if (hashLog < ZSTD_HASHLOG_MIN || hashLog > ZSTD_HASHLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid hash log value"); + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "|IiIIIIIIIIIIIIIIIIIIii:CompressionParameters", + kwlist, &format, &compressionLevel, &windowLog, &hashLog, &chainLog, + &searchLog, &minMatch, &targetLength, &compressionStrategy, + &contentSizeFlag, &checksumFlag, &dictIDFlag, &jobSize, &overlapSizeLog, + &forceMaxWindow, &enableLDM, &ldmHashLog, &ldmMinMatch, &ldmBucketSizeLog, + &ldmHashEveryLog, &threads, &compressLiterals)) { return -1; } - if (searchLog < ZSTD_SEARCHLOG_MIN || searchLog > ZSTD_SEARCHLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid search log value"); - return -1; + if (threads < 0) { + threads = cpu_count(); } - if (searchLength < ZSTD_SEARCHLENGTH_MIN || searchLength > ZSTD_SEARCHLENGTH_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid search length value"); - return -1; - } - - if (targetLength < ZSTD_TARGETLENGTH_MIN || targetLength > ZSTD_TARGETLENGTH_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid target length value"); - return -1; + if (compressLiterals < 0) { + compressLiterals = compressionLevel >= 0; } - if (strategy < ZSTD_fast || strategy > ZSTD_btopt) { - PyErr_SetString(PyExc_ValueError, "invalid strategy value"); - return -1; - } - + self->format = format; + self->compressionLevel = compressionLevel; self->windowLog = windowLog; + self->hashLog = hashLog; self->chainLog = chainLog; - 
self->hashLog = hashLog; self->searchLog = searchLog; - self->searchLength = searchLength; + self->minMatch = minMatch; self->targetLength = targetLength; - self->strategy = strategy; + self->compressionStrategy = compressionStrategy; + self->contentSizeFlag = contentSizeFlag; + self->checksumFlag = checksumFlag; + self->dictIDFlag = dictIDFlag; + self->threads = threads; + self->jobSize = jobSize; + self->overlapSizeLog = overlapSizeLog; + self->compressLiterals = compressLiterals; + self->forceMaxWindow = forceMaxWindow; + self->enableLongDistanceMatching = enableLDM; + self->ldmHashLog = ldmHashLog; + self->ldmMinMatch = ldmMinMatch; + self->ldmBucketSizeLog = ldmBucketSizeLog; + self->ldmHashEveryLog = ldmHashEveryLog; - ztopy_compression_parameters(self, ¶ms); - zresult = ZSTD_checkCParams(params); - - if (ZSTD_isError(zresult)) { - PyErr_Format(PyExc_ValueError, "invalid compression parameters: %s", - ZSTD_getErrorName(zresult)); + if (reset_params(self)) { return -1; } return 0; } -PyDoc_STRVAR(CompressionParameters_estimated_compression_context_size__doc__, +PyDoc_STRVAR(ZstdCompressionParameters_from_level__doc__, +"Create a CompressionParameters from a compression level and target sizes\n" +); + +ZstdCompressionParametersObject* CompressionParameters_from_level(PyObject* undef, PyObject* args, PyObject* kwargs) { + int managedKwargs = 0; + int level; + PyObject* sourceSize = NULL; + PyObject* dictSize = NULL; + unsigned PY_LONG_LONG iSourceSize = 0; + Py_ssize_t iDictSize = 0; + PyObject* val; + ZSTD_compressionParameters params; + ZstdCompressionParametersObject* result = NULL; + int res; + + if (!PyArg_ParseTuple(args, "i:from_level", + &level)) { + return NULL; + } + + if (!kwargs) { + kwargs = PyDict_New(); + if (!kwargs) { + return NULL; + } + managedKwargs = 1; + } + + sourceSize = PyDict_GetItemString(kwargs, "source_size"); + if (sourceSize) { +#if PY_MAJOR_VERSION >= 3 + iSourceSize = PyLong_AsUnsignedLongLong(sourceSize); + if (iSourceSize == 
(unsigned PY_LONG_LONG)(-1)) { + goto cleanup; + } +#else + iSourceSize = PyInt_AsUnsignedLongLongMask(sourceSize); +#endif + + PyDict_DelItemString(kwargs, "source_size"); + } + + dictSize = PyDict_GetItemString(kwargs, "dict_size"); + if (dictSize) { +#if PY_MAJOR_VERSION >= 3 + iDictSize = PyLong_AsSsize_t(dictSize); +#else + iDictSize = PyInt_AsSsize_t(dictSize); +#endif + if (iDictSize == -1) { + goto cleanup; + } + + PyDict_DelItemString(kwargs, "dict_size"); + } + + + params = ZSTD_getCParams(level, iSourceSize, iDictSize); + + /* Values derived from the input level and sizes are passed along to the + constructor. But only if a value doesn't already exist. */ + val = PyDict_GetItemString(kwargs, "window_log"); + if (!val) { + val = PyLong_FromUnsignedLong(params.windowLog); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "window_log", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "chain_log"); + if (!val) { + val = PyLong_FromUnsignedLong(params.chainLog); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "chain_log", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "hash_log"); + if (!val) { + val = PyLong_FromUnsignedLong(params.hashLog); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "hash_log", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "search_log"); + if (!val) { + val = PyLong_FromUnsignedLong(params.searchLog); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "search_log", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "min_match"); + if (!val) { + val = PyLong_FromUnsignedLong(params.searchLength); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "min_match", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "target_length"); + if (!val) { + val = PyLong_FromUnsignedLong(params.targetLength); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, 
"target_length", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "compression_strategy"); + if (!val) { + val = PyLong_FromUnsignedLong(params.strategy); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "compression_strategy", val); + Py_DECREF(val); + } + + val = PyDict_GetItemString(kwargs, "compress_literals"); + if (!val) { + val = PyLong_FromLong(level >= 0 ? 1 : 0); + if (!val) { + goto cleanup; + } + PyDict_SetItemString(kwargs, "compress_literals", val); + Py_DECREF(val); + } + + result = PyObject_New(ZstdCompressionParametersObject, &ZstdCompressionParametersType); + if (!result) { + goto cleanup; + } + + result->params = NULL; + + val = PyTuple_New(0); + if (!val) { + Py_CLEAR(result); + goto cleanup; + } + + res = ZstdCompressionParameters_init(result, val, kwargs); + Py_DECREF(val); + + if (res) { + Py_CLEAR(result); + goto cleanup; + } + +cleanup: + if (managedKwargs) { + Py_DECREF(kwargs); + } + + return result; +} + +PyDoc_STRVAR(ZstdCompressionParameters_estimated_compression_context_size__doc__, "Estimate the size in bytes of a compression context for compression parameters\n" ); -PyObject* CompressionParameters_estimated_compression_context_size(CompressionParametersObject* self) { - ZSTD_compressionParameters params; - - ztopy_compression_parameters(self, ¶ms); - - return PyLong_FromSize_t(ZSTD_estimateCCtxSize(params)); +PyObject* ZstdCompressionParameters_estimated_compression_context_size(ZstdCompressionParametersObject* self) { + return PyLong_FromSize_t(ZSTD_estimateCCtxSize_usingCCtxParams(self->params)); } -PyObject* estimate_compression_context_size(PyObject* self, PyObject* args) { - CompressionParametersObject* params; - ZSTD_compressionParameters zparams; - PyObject* result; +PyDoc_STRVAR(ZstdCompressionParameters__doc__, +"ZstdCompressionParameters: low-level control over zstd compression"); - if (!PyArg_ParseTuple(args, "O!:estimate_compression_context_size", - &CompressionParametersType, ¶ms)) { - 
return NULL; +static void ZstdCompressionParameters_dealloc(ZstdCompressionParametersObject* self) { + if (self->params) { + ZSTD_freeCCtxParams(self->params); + self->params = NULL; } - ztopy_compression_parameters(params, &zparams); - result = PyLong_FromSize_t(ZSTD_estimateCCtxSize(zparams)); - return result; -} - -PyDoc_STRVAR(CompressionParameters__doc__, -"CompressionParameters: low-level control over zstd compression"); - -static void CompressionParameters_dealloc(PyObject* self) { PyObject_Del(self); } -static PyMethodDef CompressionParameters_methods[] = { +static PyMethodDef ZstdCompressionParameters_methods[] = { + { + "from_level", + (PyCFunction)CompressionParameters_from_level, + METH_VARARGS | METH_KEYWORDS | METH_STATIC, + ZstdCompressionParameters_from_level__doc__ + }, { "estimated_compression_context_size", - (PyCFunction)CompressionParameters_estimated_compression_context_size, + (PyCFunction)ZstdCompressionParameters_estimated_compression_context_size, METH_NOARGS, - CompressionParameters_estimated_compression_context_size__doc__ + ZstdCompressionParameters_estimated_compression_context_size__doc__ }, { NULL, NULL } }; -static PyMemberDef CompressionParameters_members[] = { +static PyMemberDef ZstdCompressionParameters_members[] = { + { "format", T_UINT, + offsetof(ZstdCompressionParametersObject, format), READONLY, + "compression format" }, + { "compression_level", T_INT, + offsetof(ZstdCompressionParametersObject, compressionLevel), READONLY, + "compression level" }, { "window_log", T_UINT, - offsetof(CompressionParametersObject, windowLog), READONLY, + offsetof(ZstdCompressionParametersObject, windowLog), READONLY, "window log" }, - { "chain_log", T_UINT, - offsetof(CompressionParametersObject, chainLog), READONLY, - "chain log" }, { "hash_log", T_UINT, - offsetof(CompressionParametersObject, hashLog), READONLY, + offsetof(ZstdCompressionParametersObject, hashLog), READONLY, "hash log" }, + { "chain_log", T_UINT, + 
offsetof(ZstdCompressionParametersObject, chainLog), READONLY, + "chain log" }, { "search_log", T_UINT, - offsetof(CompressionParametersObject, searchLog), READONLY, + offsetof(ZstdCompressionParametersObject, searchLog), READONLY, "search log" }, - { "search_length", T_UINT, - offsetof(CompressionParametersObject, searchLength), READONLY, + { "min_match", T_UINT, + offsetof(ZstdCompressionParametersObject, minMatch), READONLY, "search length" }, { "target_length", T_UINT, - offsetof(CompressionParametersObject, targetLength), READONLY, + offsetof(ZstdCompressionParametersObject, targetLength), READONLY, "target length" }, - { "strategy", T_INT, - offsetof(CompressionParametersObject, strategy), READONLY, - "strategy" }, + { "compression_strategy", T_UINT, + offsetof(ZstdCompressionParametersObject, compressionStrategy), READONLY, + "compression strategy" }, + { "write_content_size", T_UINT, + offsetof(ZstdCompressionParametersObject, contentSizeFlag), READONLY, + "whether to write content size in frames" }, + { "write_checksum", T_UINT, + offsetof(ZstdCompressionParametersObject, checksumFlag), READONLY, + "whether to write checksum in frames" }, + { "write_dict_id", T_UINT, + offsetof(ZstdCompressionParametersObject, dictIDFlag), READONLY, + "whether to write dictionary ID in frames" }, + { "threads", T_UINT, + offsetof(ZstdCompressionParametersObject, threads), READONLY, + "number of threads to use" }, + { "job_size", T_UINT, + offsetof(ZstdCompressionParametersObject, jobSize), READONLY, + "size of compression job when using multiple threads" }, + { "overlap_size_log", T_UINT, + offsetof(ZstdCompressionParametersObject, overlapSizeLog), READONLY, + "Size of previous input reloaded at the beginning of each job" }, + { "compress_literals", T_UINT, + offsetof(ZstdCompressionParametersObject, compressLiterals), READONLY, + "whether Huffman compression of literals is in use" }, + { "force_max_window", T_UINT, + offsetof(ZstdCompressionParametersObject, 
forceMaxWindow), READONLY, + "force back references to remain smaller than window size" }, + { "enable_ldm", T_UINT, + offsetof(ZstdCompressionParametersObject, enableLongDistanceMatching), READONLY, + "whether to enable long distance matching" }, + { "ldm_hash_log", T_UINT, + offsetof(ZstdCompressionParametersObject, ldmHashLog), READONLY, + "Size of the table for long distance matching, as a power of 2" }, + { "ldm_min_match", T_UINT, + offsetof(ZstdCompressionParametersObject, ldmMinMatch), READONLY, + "minimum size of searched matches for long distance matcher" }, + { "ldm_bucket_size_log", T_UINT, + offsetof(ZstdCompressionParametersObject, ldmBucketSizeLog), READONLY, + "log size of each bucket in the LDM hash table for collision resolution" }, + { "ldm_hash_every_log", T_UINT, + offsetof(ZstdCompressionParametersObject, ldmHashEveryLog), READONLY, + "frequency of inserting/looking up entries in the LDM hash table" }, { NULL } }; -PyTypeObject CompressionParametersType = { +PyTypeObject ZstdCompressionParametersType = { PyVarObject_HEAD_INIT(NULL, 0) - "CompressionParameters", /* tp_name */ - sizeof(CompressionParametersObject), /* tp_basicsize */ + "ZstdCompressionParameters", /* tp_name */ + sizeof(ZstdCompressionParametersObject), /* tp_basicsize */ 0, /* tp_itemsize */ - (destructor)CompressionParameters_dealloc, /* tp_dealloc */ + (destructor)ZstdCompressionParameters_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ @@ -221,33 +465,38 @@ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ - CompressionParameters__doc__, /* tp_doc */ + ZstdCompressionParameters__doc__, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ - CompressionParameters_methods, /* tp_methods */ - CompressionParameters_members, /* tp_members */ + ZstdCompressionParameters_methods, /* tp_methods */ + 
ZstdCompressionParameters_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - (initproc)CompressionParameters_init, /* tp_init */ + (initproc)ZstdCompressionParameters_init, /* tp_init */ 0, /* tp_alloc */ PyType_GenericNew, /* tp_new */ }; void compressionparams_module_init(PyObject* mod) { - Py_TYPE(&CompressionParametersType) = &PyType_Type; - if (PyType_Ready(&CompressionParametersType) < 0) { + Py_TYPE(&ZstdCompressionParametersType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionParametersType) < 0) { return; } - Py_INCREF(&CompressionParametersType); + Py_INCREF(&ZstdCompressionParametersType); + PyModule_AddObject(mod, "ZstdCompressionParameters", + (PyObject*)&ZstdCompressionParametersType); + + /* TODO remove deprecated alias. */ + Py_INCREF(&ZstdCompressionParametersType); PyModule_AddObject(mod, "CompressionParameters", - (PyObject*)&CompressionParametersType); + (PyObject*)&ZstdCompressionParametersType); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/compressionreader.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/c-ext/compressionreader.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,405 @@ +/** +* Copyright (c) 2017-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. 
+*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +static void set_unsupported_operation(void) { + PyObject* iomod; + PyObject* exc; + + iomod = PyImport_ImportModule("io"); + if (NULL == iomod) { + return; + } + + exc = PyObject_GetAttrString(iomod, "UnsupportedOperation"); + if (NULL == exc) { + Py_DECREF(iomod); + return; + } + + PyErr_SetNone(exc); + Py_DECREF(exc); + Py_DECREF(iomod); +} + +static void reader_dealloc(ZstdCompressionReader* self) { + Py_XDECREF(self->compressor); + Py_XDECREF(self->reader); + + if (self->buffer.buf) { + PyBuffer_Release(&self->buffer); + memset(&self->buffer, 0, sizeof(self->buffer)); + } + + PyObject_Del(self); +} + +static ZstdCompressionReader* reader_enter(ZstdCompressionReader* self) { + size_t zresult; + + if (self->entered) { + PyErr_SetString(PyExc_ValueError, "cannot __enter__ multiple times"); + return NULL; + } + + zresult = ZSTD_CCtx_setPledgedSrcSize(self->compressor->cctx, self->sourceSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + self->entered = 1; + + Py_INCREF(self); + return self; +} + +static PyObject* reader_exit(ZstdCompressionReader* self, PyObject* args) { + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_tb; + + if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) { + return NULL; + } + + self->entered = 0; + self->closed = 1; + + /* Release resources associated with source. 
*/ + Py_CLEAR(self->reader); + if (self->buffer.buf) { + PyBuffer_Release(&self->buffer); + memset(&self->buffer, 0, sizeof(self->buffer)); + } + + Py_CLEAR(self->compressor); + + Py_RETURN_FALSE; +} + +static PyObject* reader_readable(ZstdCompressionReader* self) { + Py_RETURN_TRUE; +} + +static PyObject* reader_writable(ZstdCompressionReader* self) { + Py_RETURN_FALSE; +} + +static PyObject* reader_seekable(ZstdCompressionReader* self) { + Py_RETURN_FALSE; +} + +static PyObject* reader_readline(PyObject* self, PyObject* args) { + set_unsupported_operation(); + return NULL; +} + +static PyObject* reader_readlines(PyObject* self, PyObject* args) { + set_unsupported_operation(); + return NULL; +} + +static PyObject* reader_write(PyObject* self, PyObject* args) { + PyErr_SetString(PyExc_OSError, "stream is not writable"); + return NULL; +} + +static PyObject* reader_writelines(PyObject* self, PyObject* args) { + PyErr_SetString(PyExc_OSError, "stream is not writable"); + return NULL; +} + +static PyObject* reader_isatty(PyObject* self) { + Py_RETURN_FALSE; +} + +static PyObject* reader_flush(PyObject* self) { + Py_RETURN_NONE; +} + +static PyObject* reader_close(ZstdCompressionReader* self) { + self->closed = 1; + Py_RETURN_NONE; +} + +static PyObject* reader_closed(ZstdCompressionReader* self) { + if (self->closed) { + Py_RETURN_TRUE; + } + else { + Py_RETURN_FALSE; + } +} + +static PyObject* reader_tell(ZstdCompressionReader* self) { + /* TODO should this raise OSError since stream isn't seekable? 
*/ + return PyLong_FromUnsignedLongLong(self->bytesCompressed); +} + +static PyObject* reader_read(ZstdCompressionReader* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "size", + NULL + }; + + Py_ssize_t size = -1; + PyObject* result = NULL; + char* resultBuffer; + Py_ssize_t resultSize; + size_t zresult; + size_t oldPos; + + if (!self->entered) { + PyErr_SetString(ZstdError, "read() must be called from an active context manager"); + return NULL; + } + + if (self->closed) { + PyErr_SetString(PyExc_ValueError, "stream is closed"); + return NULL; + } + + if (self->finishedOutput) { + return PyBytes_FromStringAndSize("", 0); + } + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", kwlist, &size)) { + return NULL; + } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "cannot read negative or size 0 amounts"); + return NULL; + } + + result = PyBytes_FromStringAndSize(NULL, size); + if (NULL == result) { + return NULL; + } + + PyBytes_AsStringAndSize(result, &resultBuffer, &resultSize); + + self->output.dst = resultBuffer; + self->output.size = resultSize; + self->output.pos = 0; + +readinput: + + /* If we have data left over, consume it. */ + if (self->input.pos < self->input.size) { + oldPos = self->output.pos; + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compress_generic(self->compressor->cctx, + &self->output, &self->input, ZSTD_e_continue); + + Py_END_ALLOW_THREADS + + self->bytesCompressed += self->output.pos - oldPos; + + /* Input exhausted. Clear out state tracking. */ + if (self->input.pos == self->input.size) { + memset(&self->input, 0, sizeof(self->input)); + Py_CLEAR(self->readResult); + + if (self->buffer.buf) { + self->finishedInput = 1; + } + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + if (self->output.pos) { + /* If no more room in output, emit it. 
*/ + if (self->output.pos == self->output.size) { + memset(&self->output, 0, sizeof(self->output)); + return result; + } + + /* + * There is room in the output. We fall through to below, which will either + * get more input for us or will attempt to end the stream. + */ + } + + /* Fall through to gather more input. */ + } + + if (!self->finishedInput) { + if (self->reader) { + Py_buffer buffer; + + assert(self->readResult == NULL); + self->readResult = PyObject_CallMethod(self->reader, "read", + "k", self->readSize); + if (self->readResult == NULL) { + return NULL; + } + + memset(&buffer, 0, sizeof(buffer)); + + if (0 != PyObject_GetBuffer(self->readResult, &buffer, PyBUF_CONTIG_RO)) { + return NULL; + } + + /* EOF */ + if (0 == buffer.len) { + self->finishedInput = 1; + Py_CLEAR(self->readResult); + } + else { + self->input.src = buffer.buf; + self->input.size = buffer.len; + self->input.pos = 0; + } + + PyBuffer_Release(&buffer); + } + else { + assert(self->buffer.buf); + + self->input.src = self->buffer.buf; + self->input.size = self->buffer.len; + self->input.pos = 0; + } + } + + if (self->input.size) { + goto readinput; + } + + /* Else EOF */ + oldPos = self->output.pos; + + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &self->input, ZSTD_e_end); + + self->bytesCompressed += self->output.pos - oldPos; + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error ending compression stream: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + assert(self->output.pos); + + if (0 == zresult) { + self->finishedOutput = 1; + } + + if (safe_pybytes_resize(&result, self->output.pos)) { + Py_XDECREF(result); + return NULL; + } + + memset(&self->output, 0, sizeof(self->output)); + + return result; +} + +static PyObject* reader_readall(PyObject* self) { + PyErr_SetNone(PyExc_NotImplementedError); + return NULL; +} + +static PyObject* reader_iter(PyObject* self) { + set_unsupported_operation(); + return NULL; +} + +static PyObject* 
reader_iternext(PyObject* self) { + set_unsupported_operation(); + return NULL; +} + +static PyMethodDef reader_methods[] = { + { "__enter__", (PyCFunction)reader_enter, METH_NOARGS, + PyDoc_STR("Enter a compression context") }, + { "__exit__", (PyCFunction)reader_exit, METH_VARARGS, + PyDoc_STR("Exit a compression context") }, + { "close", (PyCFunction)reader_close, METH_NOARGS, + PyDoc_STR("Close the stream so it cannot perform any more operations") }, + { "closed", (PyCFunction)reader_closed, METH_NOARGS, + PyDoc_STR("Whether stream is closed") }, + { "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") }, + { "isatty", (PyCFunction)reader_isatty, METH_NOARGS, PyDoc_STR("Returns False") }, + { "readable", (PyCFunction)reader_readable, METH_NOARGS, + PyDoc_STR("Returns True") }, + { "read", (PyCFunction)reader_read, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("read compressed data") }, + { "readall", (PyCFunction)reader_readall, METH_NOARGS, PyDoc_STR("Not implemented") }, + { "readline", (PyCFunction)reader_readline, METH_VARARGS, PyDoc_STR("Not implemented") }, + { "readlines", (PyCFunction)reader_readlines, METH_VARARGS, PyDoc_STR("Not implemented") }, + { "seekable", (PyCFunction)reader_seekable, METH_NOARGS, + PyDoc_STR("Returns False") }, + { "tell", (PyCFunction)reader_tell, METH_NOARGS, + PyDoc_STR("Returns current number of bytes compressed") }, + { "writable", (PyCFunction)reader_writable, METH_NOARGS, + PyDoc_STR("Returns False") }, + { "write", reader_write, METH_VARARGS, PyDoc_STR("Raises OSError") }, + { "writelines", reader_writelines, METH_VARARGS, PyDoc_STR("Not implemented") }, + { NULL, NULL } +}; + +PyTypeObject ZstdCompressionReaderType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressionReader", /* tp_name */ + sizeof(ZstdCompressionReader), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)reader_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* 
tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + reader_iter, /* tp_iter */ + reader_iternext, /* tp_iternext */ + reader_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressionreader_module_init(PyObject* mod) { + /* TODO make reader a sub-class of io.RawIOBase */ + + Py_TYPE(&ZstdCompressionReaderType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionReaderType) < 0) { + return; + } +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/compressionwriter.c --- a/contrib/python-zstandard/c-ext/compressionwriter.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/compressionwriter.c Wed Apr 18 15:32:08 2018 -0400 @@ -22,20 +22,18 @@ } static PyObject* ZstdCompressionWriter_enter(ZstdCompressionWriter* self) { + size_t zresult; + if (self->entered) { PyErr_SetString(ZstdError, "cannot __enter__ multiple times"); return NULL; } - if (self->compressor->mtcctx) { - if (init_mtcstream(self->compressor, self->sourceSize)) { - return NULL; - } - } - else { - if (0 != init_cstream(self->compressor, self->sourceSize)) { - return NULL; - } + zresult = ZSTD_CCtx_setPledgedSrcSize(self->compressor->cctx, self->sourceSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); + return NULL; } self->entered = 1; @@ -59,8 +57,12 @@ self->entered = 0; - if ((self->compressor->cstream || self->compressor->mtcctx) && exc_type == Py_None - && exc_value 
== Py_None && exc_tb == Py_None) { + if (exc_type == Py_None && exc_value == Py_None && exc_tb == Py_None) { + ZSTD_inBuffer inBuffer; + + inBuffer.src = NULL; + inBuffer.size = 0; + inBuffer.pos = 0; output.dst = PyMem_Malloc(self->outSize); if (!output.dst) { @@ -70,12 +72,7 @@ output.pos = 0; while (1) { - if (self->compressor->mtcctx) { - zresult = ZSTDMT_endStream(self->compressor->mtcctx, &output); - } - else { - zresult = ZSTD_endStream(self->compressor->cstream, &output); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &output, &inBuffer, ZSTD_e_end); if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "error ending compression stream: %s", ZSTD_getErrorName(zresult)); @@ -107,18 +104,17 @@ } static PyObject* ZstdCompressionWriter_memory_size(ZstdCompressionWriter* self) { - if (!self->compressor->cstream) { - PyErr_SetString(ZstdError, "cannot determine size of an inactive compressor; " - "call when a context manager is active"); - return NULL; - } - - return PyLong_FromSize_t(ZSTD_sizeof_CStream(self->compressor->cstream)); + return PyLong_FromSize_t(ZSTD_sizeof_CCtx(self->compressor->cctx)); } -static PyObject* ZstdCompressionWriter_write(ZstdCompressionWriter* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; +static PyObject* ZstdCompressionWriter_write(ZstdCompressionWriter* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + NULL + }; + + PyObject* result = NULL; + Py_buffer source; size_t zresult; ZSTD_inBuffer input; ZSTD_outBuffer output; @@ -126,44 +122,46 @@ Py_ssize_t totalWrite = 0; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:write", &source, &sourceSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:write", #else - if (!PyArg_ParseTuple(args, "s#:write", &source, &sourceSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:write", #endif + kwlist, &source)) { return NULL; } if (!self->entered) { PyErr_SetString(ZstdError, "compress must be 
called from an active context manager"); - return NULL; + goto finally; + } + + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; } output.dst = PyMem_Malloc(self->outSize); if (!output.dst) { - return PyErr_NoMemory(); + PyErr_NoMemory(); + goto finally; } output.size = self->outSize; output.pos = 0; - input.src = source; - input.size = sourceSize; + input.src = source.buf; + input.size = source.len; input.pos = 0; - while ((ssize_t)input.pos < sourceSize) { + while ((ssize_t)input.pos < source.len) { Py_BEGIN_ALLOW_THREADS - if (self->compressor->mtcctx) { - zresult = ZSTDMT_compressStream(self->compressor->mtcctx, - &output, &input); - } - else { - zresult = ZSTD_compressStream(self->compressor->cstream, &output, &input); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &output, &input, ZSTD_e_continue); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyMem_Free(output.dst); PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); - return NULL; + goto finally; } /* Copy data from output buffer to writer. 
*/ @@ -176,18 +174,24 @@ output.dst, output.pos); Py_XDECREF(res); totalWrite += output.pos; + self->bytesCompressed += output.pos; } output.pos = 0; } PyMem_Free(output.dst); - return PyLong_FromSsize_t(totalWrite); + result = PyLong_FromSsize_t(totalWrite); + +finally: + PyBuffer_Release(&source); + return result; } static PyObject* ZstdCompressionWriter_flush(ZstdCompressionWriter* self, PyObject* args) { size_t zresult; ZSTD_outBuffer output; + ZSTD_inBuffer input; PyObject* res; Py_ssize_t totalWrite = 0; @@ -196,6 +200,10 @@ return NULL; } + input.src = NULL; + input.size = 0; + input.pos = 0; + output.dst = PyMem_Malloc(self->outSize); if (!output.dst) { return PyErr_NoMemory(); @@ -205,12 +213,7 @@ while (1) { Py_BEGIN_ALLOW_THREADS - if (self->compressor->mtcctx) { - zresult = ZSTDMT_flushStream(self->compressor->mtcctx, &output); - } - else { - zresult = ZSTD_flushStream(self->compressor->cstream, &output); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &output, &input, ZSTD_e_flush); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { @@ -233,6 +236,7 @@ output.dst, output.pos); Py_XDECREF(res); totalWrite += output.pos; + self->bytesCompressed += output.pos; } output.pos = 0; } @@ -242,6 +246,10 @@ return PyLong_FromSsize_t(totalWrite); } +static PyObject* ZstdCompressionWriter_tell(ZstdCompressionWriter* self) { + return PyLong_FromUnsignedLongLong(self->bytesCompressed); +} + static PyMethodDef ZstdCompressionWriter_methods[] = { { "__enter__", (PyCFunction)ZstdCompressionWriter_enter, METH_NOARGS, PyDoc_STR("Enter a compression context.") }, @@ -249,10 +257,12 @@ PyDoc_STR("Exit a compression context.") }, { "memory_size", (PyCFunction)ZstdCompressionWriter_memory_size, METH_NOARGS, PyDoc_STR("Obtain the memory size of the underlying compressor") }, - { "write", (PyCFunction)ZstdCompressionWriter_write, METH_VARARGS, + { "write", (PyCFunction)ZstdCompressionWriter_write, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("Compress data") }, { 
"flush", (PyCFunction)ZstdCompressionWriter_flush, METH_NOARGS, PyDoc_STR("Flush data and finish a zstd frame") }, + { "tell", (PyCFunction)ZstdCompressionWriter_tell, METH_NOARGS, + PyDoc_STR("Returns current number of bytes compressed") }, { NULL, NULL } }; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/compressobj.c --- a/contrib/python-zstandard/c-ext/compressobj.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/compressobj.c Wed Apr 18 15:32:08 2018 -0400 @@ -23,9 +23,13 @@ PyObject_Del(self); } -static PyObject* ZstdCompressionObj_compress(ZstdCompressionObj* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; +static PyObject* ZstdCompressionObj_compress(ZstdCompressionObj* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + NULL + }; + + Py_buffer source; ZSTD_inBuffer input; size_t zresult; PyObject* result = NULL; @@ -37,38 +41,43 @@ } #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:compress", &source, &sourceSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:compress", #else - if (!PyArg_ParseTuple(args, "s#:compress", &source, &sourceSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:compress", #endif + kwlist, &source)) { return NULL; } - input.src = source; - input.size = sourceSize; + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + + input.src = source.buf; + input.size = source.len; input.pos = 0; - while ((ssize_t)input.pos < sourceSize) { + while ((ssize_t)input.pos < source.len) { Py_BEGIN_ALLOW_THREADS - if (self->compressor->mtcctx) { - zresult = ZSTDMT_compressStream(self->compressor->mtcctx, - &self->output, &input); - } - else { - zresult = ZSTD_compressStream(self->compressor->cstream, &self->output, &input); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, 
&self->output, + &input, ZSTD_e_continue); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); - return NULL; + Py_CLEAR(result); + goto finally; } if (self->output.pos) { if (result) { resultSize = PyBytes_GET_SIZE(result); - if (-1 == _PyBytes_Resize(&result, resultSize + self->output.pos)) { - return NULL; + + if (safe_pybytes_resize(&result, resultSize + self->output.pos)) { + Py_CLEAR(result); + goto finally; } memcpy(PyBytes_AS_STRING(result) + resultSize, @@ -77,7 +86,7 @@ else { result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); if (!result) { - return NULL; + goto finally; } } @@ -85,21 +94,29 @@ } } - if (result) { - return result; + if (NULL == result) { + result = PyBytes_FromString(""); } - else { - return PyBytes_FromString(""); - } + +finally: + PyBuffer_Release(&source); + + return result; } -static PyObject* ZstdCompressionObj_flush(ZstdCompressionObj* self, PyObject* args) { +static PyObject* ZstdCompressionObj_flush(ZstdCompressionObj* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "flush_mode", + NULL + }; + int flushMode = compressorobj_flush_finish; size_t zresult; PyObject* result = NULL; Py_ssize_t resultSize = 0; + ZSTD_inBuffer input; - if (!PyArg_ParseTuple(args, "|i:flush", &flushMode)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:flush", kwlist, &flushMode)) { return NULL; } @@ -115,16 +132,16 @@ assert(self->output.pos == 0); + input.src = NULL; + input.size = 0; + input.pos = 0; + if (flushMode == compressorobj_flush_block) { /* The output buffer is of size ZSTD_CStreamOutSize(), which is guaranteed to hold a full block. 
*/ Py_BEGIN_ALLOW_THREADS - if (self->compressor->mtcctx) { - zresult = ZSTDMT_flushStream(self->compressor->mtcctx, &self->output); - } - else { - zresult = ZSTD_flushStream(self->compressor->cstream, &self->output); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &input, ZSTD_e_flush); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { @@ -156,12 +173,8 @@ self->finished = 1; while (1) { - if (self->compressor->mtcctx) { - zresult = ZSTDMT_endStream(self->compressor->mtcctx, &self->output); - } - else { - zresult = ZSTD_endStream(self->compressor->cstream, &self->output); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &input, ZSTD_e_end); if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "error ending compression stream: %s", ZSTD_getErrorName(zresult)); @@ -171,7 +184,9 @@ if (self->output.pos) { if (result) { resultSize = PyBytes_GET_SIZE(result); - if (-1 == _PyBytes_Resize(&result, resultSize + self->output.pos)) { + + if (safe_pybytes_resize(&result, resultSize + self->output.pos)) { + Py_XDECREF(result); return NULL; } @@ -202,9 +217,9 @@ } static PyMethodDef ZstdCompressionObj_methods[] = { - { "compress", (PyCFunction)ZstdCompressionObj_compress, METH_VARARGS, + { "compress", (PyCFunction)ZstdCompressionObj_compress, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("compress data") }, - { "flush", (PyCFunction)ZstdCompressionObj_flush, METH_VARARGS, + { "flush", (PyCFunction)ZstdCompressionObj_flush, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("finish compression operation") }, { NULL, NULL } }; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/compressor.c --- a/contrib/python-zstandard/c-ext/compressor.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/compressor.c Wed Apr 18 15:32:08 2018 -0400 @@ -11,118 +11,78 @@ extern PyObject* ZstdError; -int populate_cdict(ZstdCompressor* compressor, ZSTD_parameters* zparams) { - ZSTD_customMem zmem; +int 
ensure_cctx(ZstdCompressor* compressor) { + size_t zresult; + + assert(compressor); + assert(compressor->cctx); + assert(compressor->params); - if (compressor->cdict || !compressor->dict || !compressor->dict->dictData) { - return 0; + ZSTD_CCtx_reset(compressor->cctx); + + zresult = ZSTD_CCtx_setParametersUsingCCtxParams(compressor->cctx, compressor->params); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not set compression parameters: %s", + ZSTD_getErrorName(zresult)); + return 1; } - Py_BEGIN_ALLOW_THREADS - memset(&zmem, 0, sizeof(zmem)); - compressor->cdict = ZSTD_createCDict_advanced(compressor->dict->dictData, - compressor->dict->dictSize, 1, *zparams, zmem); - Py_END_ALLOW_THREADS - - if (!compressor->cdict) { - PyErr_SetString(ZstdError, "could not create compression dictionary"); - return 1; + if (compressor->dict) { + if (compressor->dict->cdict) { + zresult = ZSTD_CCtx_refCDict(compressor->cctx, compressor->dict->cdict); + } + else { + zresult = ZSTD_CCtx_loadDictionary_advanced(compressor->cctx, + compressor->dict->dictData, compressor->dict->dictSize, + ZSTD_dlm_byRef, compressor->dict->dictType); + } + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not load compression dictionary: %s", + ZSTD_getErrorName(zresult)); + return 1; + } } return 0; } -/** - * Ensure the ZSTD_CStream on a ZstdCompressor instance is initialized. - * - * Returns 0 on success. Other value on failure. Will set a Python exception - * on failure. 
- */ -int init_cstream(ZstdCompressor* compressor, unsigned long long sourceSize) { - ZSTD_parameters zparams; - void* dictData = NULL; - size_t dictSize = 0; - size_t zresult; +static PyObject* frame_progression(ZSTD_CCtx* cctx) { + PyObject* result = NULL; + PyObject* value; + ZSTD_frameProgression progression; - if (compressor->cstream) { - zresult = ZSTD_resetCStream(compressor->cstream, sourceSize); - if (ZSTD_isError(zresult)) { - PyErr_Format(ZstdError, "could not reset CStream: %s", - ZSTD_getErrorName(zresult)); - return -1; - } - - return 0; + result = PyTuple_New(3); + if (!result) { + return NULL; } - compressor->cstream = ZSTD_createCStream(); - if (!compressor->cstream) { - PyErr_SetString(ZstdError, "could not create CStream"); - return -1; - } + progression = ZSTD_getFrameProgression(cctx); - if (compressor->dict) { - dictData = compressor->dict->dictData; - dictSize = compressor->dict->dictSize; - } - - memset(&zparams, 0, sizeof(zparams)); - if (compressor->cparams) { - ztopy_compression_parameters(compressor->cparams, &zparams.cParams); - /* Do NOT call ZSTD_adjustCParams() here because the compression params - come from the user. 
*/ - } - else { - zparams.cParams = ZSTD_getCParams(compressor->compressionLevel, sourceSize, dictSize); + value = PyLong_FromUnsignedLongLong(progression.ingested); + if (!value) { + Py_DECREF(result); + return NULL; } - zparams.fParams = compressor->fparams; - - zresult = ZSTD_initCStream_advanced(compressor->cstream, dictData, dictSize, - zparams, sourceSize); + PyTuple_SET_ITEM(result, 0, value); - if (ZSTD_isError(zresult)) { - ZSTD_freeCStream(compressor->cstream); - compressor->cstream = NULL; - PyErr_Format(ZstdError, "cannot init CStream: %s", ZSTD_getErrorName(zresult)); - return -1; + value = PyLong_FromUnsignedLongLong(progression.consumed); + if (!value) { + Py_DECREF(result); + return NULL; } - return 0;; -} + PyTuple_SET_ITEM(result, 1, value); -int init_mtcstream(ZstdCompressor* compressor, Py_ssize_t sourceSize) { - size_t zresult; - void* dictData = NULL; - size_t dictSize = 0; - ZSTD_parameters zparams; - - assert(compressor->mtcctx); - - if (compressor->dict) { - dictData = compressor->dict->dictData; - dictSize = compressor->dict->dictSize; + value = PyLong_FromUnsignedLongLong(progression.produced); + if (!value) { + Py_DECREF(result); + return NULL; } - memset(&zparams, 0, sizeof(zparams)); - if (compressor->cparams) { - ztopy_compression_parameters(compressor->cparams, &zparams.cParams); - } - else { - zparams.cParams = ZSTD_getCParams(compressor->compressionLevel, sourceSize, dictSize); - } - - zparams.fParams = compressor->fparams; + PyTuple_SET_ITEM(result, 2, value); - zresult = ZSTDMT_initCStream_advanced(compressor->mtcctx, dictData, dictSize, - zparams, sourceSize); - - if (ZSTD_isError(zresult)) { - PyErr_Format(ZstdError, "cannot init CStream: %s", ZSTD_getErrorName(zresult)); - return -1; - } - - return 0; + return result; } PyDoc_STRVAR(ZstdCompressor__doc__, @@ -147,9 +107,9 @@ " If True, a 4 byte content checksum will be written with the compressed\n" " data, allowing the decompressor to perform content verification.\n" 
"write_content_size\n" -" If True, the decompressed content size will be included in the header of\n" -" the compressed data. This data will only be written if the compressor\n" -" knows the size of the input data.\n" +" If True (the default), the decompressed content size will be included in\n" +" the header of the compressed data. This data will only be written if the\n" +" compressor knows the size of the input data.\n" "write_dict_id\n" " Determines whether the dictionary ID will be written into the compressed\n" " data. Defaults to True. Only adds content to the compressed data if\n" @@ -175,7 +135,7 @@ int level = 3; ZstdCompressionDict* dict = NULL; - CompressionParametersObject* params = NULL; + ZstdCompressionParametersObject* params = NULL; PyObject* writeChecksum = NULL; PyObject* writeContentSize = NULL; PyObject* writeDictID = NULL; @@ -183,16 +143,11 @@ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOOi:ZstdCompressor", kwlist, &level, &ZstdCompressionDictType, &dict, - &CompressionParametersType, ¶ms, + &ZstdCompressionParametersType, ¶ms, &writeChecksum, &writeContentSize, &writeDictID, &threads)) { return -1; } - if (level < 1) { - PyErr_SetString(PyExc_ValueError, "level must be greater than 0"); - return -1; - } - if (level > ZSTD_maxCLevel()) { PyErr_Format(PyExc_ValueError, "level must be less than %d", ZSTD_maxCLevel() + 1); @@ -203,79 +158,135 @@ threads = cpu_count(); } - self->threads = threads; - /* We create a ZSTD_CCtx for reuse among multiple operations to reduce the overhead of each compression operation. */ - if (threads) { - self->mtcctx = ZSTDMT_createCCtx(threads); - if (!self->mtcctx) { - PyErr_NoMemory(); + self->cctx = ZSTD_createCCtx(); + if (!self->cctx) { + PyErr_NoMemory(); + return -1; + } + + /* TODO stuff the original parameters away somewhere so we can reset later. This + will allow us to do things like automatically adjust cparams based on input + size (assuming zstd isn't doing that internally). 
*/ + + self->params = ZSTD_createCCtxParams(); + if (!self->params) { + PyErr_NoMemory(); + return -1; + } + + if (params && writeChecksum) { + PyErr_SetString(PyExc_ValueError, + "cannot define compression_params and write_checksum"); + return -1; + } + + if (params && writeContentSize) { + PyErr_SetString(PyExc_ValueError, + "cannot define compression_params and write_content_size"); + return -1; + } + + if (params && writeDictID) { + PyErr_SetString(PyExc_ValueError, + "cannot define compression_params and write_dict_id"); + return -1; + } + + if (params && threads) { + PyErr_SetString(PyExc_ValueError, + "cannot define compression_params and threads"); + return -1; + } + + if (params) { + if (set_parameters(self->params, params)) { return -1; } } else { - self->cctx = ZSTD_createCCtx(); - if (!self->cctx) { - PyErr_NoMemory(); + if (set_parameter(self->params, ZSTD_p_compressionLevel, level)) { + return -1; + } + + if (set_parameter(self->params, ZSTD_p_contentSizeFlag, + writeContentSize ? PyObject_IsTrue(writeContentSize) : 1)) { + return -1; + } + + if (set_parameter(self->params, ZSTD_p_checksumFlag, + writeChecksum ? PyObject_IsTrue(writeChecksum) : 0)) { return -1; } - } + + if (set_parameter(self->params, ZSTD_p_dictIDFlag, + writeDictID ? 
PyObject_IsTrue(writeDictID) : 1)) { + return -1; + } - self->compressionLevel = level; + if (threads) { + if (set_parameter(self->params, ZSTD_p_nbWorkers, threads)) { + return -1; + } + } + } if (dict) { self->dict = dict; Py_INCREF(dict); } - if (params) { - self->cparams = params; - Py_INCREF(params); - } - - memset(&self->fparams, 0, sizeof(self->fparams)); - - if (writeChecksum && PyObject_IsTrue(writeChecksum)) { - self->fparams.checksumFlag = 1; - } - if (writeContentSize && PyObject_IsTrue(writeContentSize)) { - self->fparams.contentSizeFlag = 1; - } - if (writeDictID && PyObject_Not(writeDictID)) { - self->fparams.noDictIDFlag = 1; + if (ensure_cctx(self)) { + return -1; } return 0; } static void ZstdCompressor_dealloc(ZstdCompressor* self) { - if (self->cstream) { - ZSTD_freeCStream(self->cstream); - self->cstream = NULL; - } - - Py_XDECREF(self->cparams); - Py_XDECREF(self->dict); - - if (self->cdict) { - ZSTD_freeCDict(self->cdict); - self->cdict = NULL; - } - if (self->cctx) { ZSTD_freeCCtx(self->cctx); self->cctx = NULL; } - if (self->mtcctx) { - ZSTDMT_freeCCtx(self->mtcctx); - self->mtcctx = NULL; + if (self->params) { + ZSTD_freeCCtxParams(self->params); + self->params = NULL; } + Py_XDECREF(self->dict); PyObject_Del(self); } +PyDoc_STRVAR(ZstdCompressor_memory_size__doc__, +"memory_size()\n" +"\n" +"Obtain the memory usage of this compressor, in bytes.\n" +); + +static PyObject* ZstdCompressor_memory_size(ZstdCompressor* self) { + if (self->cctx) { + return PyLong_FromSize_t(ZSTD_sizeof_CCtx(self->cctx)); + } + else { + PyErr_SetString(ZstdError, "no compressor context found; this should never happen"); + return NULL; + } +} + +PyDoc_STRVAR(ZstdCompressor_frame_progression__doc__, +"frame_progression()\n" +"\n" +"Return information on how much work the compressor has done.\n" +"\n" +"Returns a 3-tuple of (ingested, consumed, produced).\n" +); + +static PyObject* ZstdCompressor_frame_progression(ZstdCompressor* self) { + return 
frame_progression(self->cctx); +} + PyDoc_STRVAR(ZstdCompressor_copy_stream__doc__, "copy_stream(ifh, ofh[, size=0, read_size=default, write_size=default])\n" "compress data between streams\n" @@ -304,7 +315,7 @@ PyObject* source; PyObject* dest; - Py_ssize_t sourceSize = 0; + unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN; size_t inSize = ZSTD_CStreamInSize(); size_t outSize = ZSTD_CStreamOutSize(); ZSTD_inBuffer input; @@ -313,14 +324,14 @@ Py_ssize_t totalWrite = 0; char* readBuffer; Py_ssize_t readSize; - PyObject* readResult; + PyObject* readResult = NULL; PyObject* res = NULL; size_t zresult; PyObject* writeResult; PyObject* totalReadPy; PyObject* totalWritePy; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nkk:copy_stream", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|Kkk:copy_stream", kwlist, &source, &dest, &sourceSize, &inSize, &outSize)) { return NULL; } @@ -335,22 +346,18 @@ return NULL; } - /* Prevent free on uninitialized memory in finally. */ - output.dst = NULL; - - if (self->mtcctx) { - if (init_mtcstream(self, sourceSize)) { - res = NULL; - goto finally; - } - } - else { - if (0 != init_cstream(self, sourceSize)) { - res = NULL; - goto finally; - } + if (ensure_cctx(self)) { + return NULL; } + zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + /* Prevent free on uninitialized memory in finally. */ output.dst = PyMem_Malloc(outSize); if (!output.dst) { PyErr_NoMemory(); @@ -360,6 +367,10 @@ output.size = outSize; output.pos = 0; + input.src = NULL; + input.size = 0; + input.pos = 0; + while (1) { /* Try to read from source stream. 
*/ readResult = PyObject_CallMethod(source, "read", "n", inSize); @@ -384,12 +395,7 @@ while (input.pos < input.size) { Py_BEGIN_ALLOW_THREADS - if (self->mtcctx) { - zresult = ZSTDMT_compressStream(self->mtcctx, &output, &input); - } - else { - zresult = ZSTD_compressStream(self->cstream, &output, &input); - } + zresult = ZSTD_compress_generic(self->cctx, &output, &input, ZSTD_e_continue); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { @@ -410,16 +416,18 @@ output.pos = 0; } } + + Py_CLEAR(readResult); } /* We've finished reading. Now flush the compressor stream. */ + assert(input.pos == input.size); + while (1) { - if (self->mtcctx) { - zresult = ZSTDMT_endStream(self->mtcctx, &output); - } - else { - zresult = ZSTD_endStream(self->cstream, &output); - } + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compress_generic(self->cctx, &output, &input, ZSTD_e_end); + Py_END_ALLOW_THREADS + if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "error ending compression stream: %s", ZSTD_getErrorName(zresult)); @@ -455,11 +463,81 @@ PyMem_Free(output.dst); } + Py_XDECREF(readResult); + return res; } +PyDoc_STRVAR(ZstdCompressor_stream_reader__doc__, +"stream_reader(source, [size=0])\n" +"\n" +"Obtain an object that behaves like an I/O stream.\n" +"\n" +"The source object can be any object with a ``read(size)`` method\n" +"or an object that conforms to the buffer protocol.\n" +); + +static ZstdCompressionReader* ZstdCompressor_stream_reader(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "source", + "size", + "read_size", + NULL + }; + + PyObject* source; + unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN; + size_t readSize = ZSTD_CStreamInSize(); + ZstdCompressionReader* result = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kk:stream_reader", kwlist, + &source, &sourceSize, &readSize)) { + return NULL; + } + + result = (ZstdCompressionReader*)PyObject_CallObject((PyObject*)&ZstdCompressionReaderType, NULL); + if 
(!result) { + return NULL; + } + + if (PyObject_HasAttrString(source, "read")) { + result->reader = source; + Py_INCREF(source); + result->readSize = readSize; + } + else if (1 == PyObject_CheckBuffer(source)) { + if (0 != PyObject_GetBuffer(source, &result->buffer, PyBUF_CONTIG_RO)) { + goto except; + } + + assert(result->buffer.len >= 0); + + sourceSize = result->buffer.len; + } + else { + PyErr_SetString(PyExc_TypeError, + "must pass an object with a read() method or that conforms to the buffer protocol"); + goto except; + } + + if (ensure_cctx(self)) { + goto except; + } + + result->compressor = self; + Py_INCREF(self); + result->sourceSize = sourceSize; + + return result; + +except: + Py_CLEAR(result); + + return NULL; +} + PyDoc_STRVAR(ZstdCompressor_compress__doc__, -"compress(data, allow_empty=False)\n" +"compress(data)\n" "\n" "Compress data in a single operation.\n" "\n" @@ -473,122 +551,79 @@ static PyObject* ZstdCompressor_compress(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "data", - "allow_empty", NULL }; - const char* source; - Py_ssize_t sourceSize; - PyObject* allowEmpty = NULL; + Py_buffer source; size_t destSize; - PyObject* output; - char* dest; - void* dictData = NULL; - size_t dictSize = 0; + PyObject* output = NULL; size_t zresult; - ZSTD_parameters zparams; + ZSTD_outBuffer outBuffer; + ZSTD_inBuffer inBuffer; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|O:compress", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|O:compress", #else - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|O:compress", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|O:compress", #endif - kwlist, &source, &sourceSize, &allowEmpty)) { - return NULL; - } - - if (self->threads && self->dict) { - PyErr_SetString(ZstdError, - "compress() cannot be used with both dictionaries and multi-threaded compression"); - return NULL; - } - - if (self->threads && self->cparams) { - 
PyErr_SetString(ZstdError, - "compress() cannot be used with both compression parameters and multi-threaded compression"); - return NULL; - } - - /* Limitation in zstd C API doesn't let decompression side distinguish - between content size of 0 and unknown content size. This can make round - tripping via Python difficult. Until this is fixed, require a flag - to fire the footgun. - https://github.com/indygreg/python-zstandard/issues/11 */ - if (0 == sourceSize && self->fparams.contentSizeFlag - && (!allowEmpty || PyObject_Not(allowEmpty))) { - PyErr_SetString(PyExc_ValueError, "cannot write empty inputs when writing content sizes"); - return NULL; - } - - destSize = ZSTD_compressBound(sourceSize); - output = PyBytes_FromStringAndSize(NULL, destSize); - if (!output) { + kwlist, &source)) { return NULL; } - dest = PyBytes_AsString(output); - - if (self->dict) { - dictData = self->dict->dictData; - dictSize = self->dict->dictSize; + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; } - memset(&zparams, 0, sizeof(zparams)); - if (!self->cparams) { - zparams.cParams = ZSTD_getCParams(self->compressionLevel, sourceSize, dictSize); + if (ensure_cctx(self)) { + goto finally; } - else { - ztopy_compression_parameters(self->cparams, &zparams.cParams); - /* Do NOT call ZSTD_adjustCParams() here because the compression params - come from the user. */ + + destSize = ZSTD_compressBound(source.len); + output = PyBytes_FromStringAndSize(NULL, destSize); + if (!output) { + goto finally; } - zparams.fParams = self->fparams; - - /* The raw dict data has to be processed before it can be used. Since this - adds overhead - especially if multiple dictionary compression operations - are performed on the same ZstdCompressor instance - we create a - ZSTD_CDict once and reuse it for all operations. 
+ zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, source.len); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); + Py_CLEAR(output); + goto finally; + } - Note: the compression parameters used for the first invocation (possibly - derived from the source size) will be reused on all subsequent invocations. - https://github.com/facebook/zstd/issues/358 contains more info. We could - potentially add an argument somewhere to control this behavior. - */ - if (0 != populate_cdict(self, &zparams)) { - Py_DECREF(output); - return NULL; - } + inBuffer.src = source.buf; + inBuffer.size = source.len; + inBuffer.pos = 0; + + outBuffer.dst = PyBytes_AsString(output); + outBuffer.size = destSize; + outBuffer.pos = 0; Py_BEGIN_ALLOW_THREADS - if (self->mtcctx) { - zresult = ZSTDMT_compressCCtx(self->mtcctx, dest, destSize, - source, sourceSize, self->compressionLevel); - } - else { - /* By avoiding ZSTD_compress(), we don't necessarily write out content - size. This means the argument to ZstdCompressor to control frame - parameters is honored. */ - if (self->cdict) { - zresult = ZSTD_compress_usingCDict(self->cctx, dest, destSize, - source, sourceSize, self->cdict); - } - else { - zresult = ZSTD_compress_advanced(self->cctx, dest, destSize, - source, sourceSize, dictData, dictSize, zparams); - } - } + /* By avoiding ZSTD_compress(), we don't necessarily write out content + size. This means the argument to ZstdCompressor to control frame + parameters is honored. 
*/ + zresult = ZSTD_compress_generic(self->cctx, &outBuffer, &inBuffer, ZSTD_e_end); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "cannot compress: %s", ZSTD_getErrorName(zresult)); Py_CLEAR(output); - return NULL; + goto finally; } - else { - Py_SIZE(output) = zresult; + else if (zresult) { + PyErr_SetString(ZstdError, "unexpected partial frame flush"); + Py_CLEAR(output); + goto finally; } + Py_SIZE(output) = outBuffer.pos; + +finally: + PyBuffer_Release(&source); return output; } @@ -608,11 +643,23 @@ NULL }; - Py_ssize_t inSize = 0; + unsigned long long inSize = ZSTD_CONTENTSIZE_UNKNOWN; size_t outSize = ZSTD_CStreamOutSize(); ZstdCompressionObj* result = NULL; + size_t zresult; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n:compressobj", kwlist, &inSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|K:compressobj", kwlist, &inSize)) { + return NULL; + } + + if (ensure_cctx(self)) { + return NULL; + } + + zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, inSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); return NULL; } @@ -621,19 +668,6 @@ return NULL; } - if (self->mtcctx) { - if (init_mtcstream(self, inSize)) { - Py_DECREF(result); - return NULL; - } - } - else { - if (0 != init_cstream(self, inSize)) { - Py_DECREF(result); - return NULL; - } - } - result->output.dst = PyMem_Malloc(outSize); if (!result->output.dst) { PyErr_NoMemory(); @@ -647,9 +681,9 @@ return result; } -PyDoc_STRVAR(ZstdCompressor_read_from__doc__, -"read_from(reader, [size=0, read_size=default, write_size=default])\n" -"Read uncompress data from a reader and return an iterator\n" +PyDoc_STRVAR(ZstdCompressor_read_to_iter__doc__, +"read_to_iter(reader, [size=0, read_size=default, write_size=default])\n" +"Read uncompressed data from a reader and return an iterator\n" "\n" "Returns an iterator of compressed data produced from reading from ``reader``.\n" "\n" @@ -667,7 
+701,7 @@ "not consume from the reader unless the caller consumes from the iterator.\n" ); -static ZstdCompressorIterator* ZstdCompressor_read_from(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { +static ZstdCompressorIterator* ZstdCompressor_read_to_iter(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "reader", "size", @@ -677,12 +711,13 @@ }; PyObject* reader; - Py_ssize_t sourceSize = 0; + unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN; size_t inSize = ZSTD_CStreamInSize(); size_t outSize = ZSTD_CStreamOutSize(); ZstdCompressorIterator* result; + size_t zresult; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nkk:read_from", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kkk:read_to_iter", kwlist, &reader, &sourceSize, &inSize, &outSize)) { return NULL; } @@ -696,18 +731,11 @@ Py_INCREF(result->reader); } else if (1 == PyObject_CheckBuffer(reader)) { - result->buffer = PyMem_Malloc(sizeof(Py_buffer)); - if (!result->buffer) { + if (0 != PyObject_GetBuffer(reader, &result->buffer, PyBUF_CONTIG_RO)) { goto except; } - memset(result->buffer, 0, sizeof(Py_buffer)); - - if (0 != PyObject_GetBuffer(reader, result->buffer, PyBUF_CONTIG_RO)) { - goto except; - } - - sourceSize = result->buffer->len; + sourceSize = result->buffer.len; } else { PyErr_SetString(PyExc_ValueError, @@ -715,22 +743,20 @@ goto except; } + if (ensure_cctx(self)) { + return NULL; + } + + zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + result->compressor = self; Py_INCREF(result->compressor); - result->sourceSize = sourceSize; - - if (self->mtcctx) { - if (init_mtcstream(self, sourceSize)) { - goto except; - } - } - else { - if (0 != init_cstream(self, sourceSize)) { - goto except; - } - } - result->inSize = inSize; result->outSize = outSize; @@ -744,16 +770,13 @@ goto 
finally; except: - Py_XDECREF(result->compressor); - Py_XDECREF(result->reader); - Py_DECREF(result); - result = NULL; + Py_CLEAR(result); finally: return result; } -PyDoc_STRVAR(ZstdCompressor_write_to___doc__, +PyDoc_STRVAR(ZstdCompressor_stream_writer___doc__, "Create a context manager to write compressed data to an object.\n" "\n" "The passed object must have a ``write()`` method.\n" @@ -771,7 +794,7 @@ "for a compressor output stream.\n" ); -static ZstdCompressionWriter* ZstdCompressor_write_to(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { +static ZstdCompressionWriter* ZstdCompressor_stream_writer(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "writer", "size", @@ -781,10 +804,10 @@ PyObject* writer; ZstdCompressionWriter* result; - Py_ssize_t sourceSize = 0; + unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN; size_t outSize = ZSTD_CStreamOutSize(); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nk:write_to", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kk:stream_writer", kwlist, &writer, &sourceSize, &outSize)) { return NULL; } @@ -794,6 +817,10 @@ return NULL; } + if (ensure_cctx(self)) { + return NULL; + } + result = (ZstdCompressionWriter*)PyObject_CallObject((PyObject*)&ZstdCompressionWriterType, NULL); if (!result) { return NULL; @@ -807,6 +834,7 @@ result->sourceSize = sourceSize; result->outSize = outSize; + result->bytesCompressed = 0; return result; } @@ -833,6 +861,7 @@ WorkerError_none = 0, WorkerError_zstd = 1, WorkerError_no_memory = 2, + WorkerError_nospace = 3, } WorkerError; /** @@ -841,10 +870,6 @@ typedef struct { /* Used for compression. */ ZSTD_CCtx* cctx; - ZSTD_CDict* cdict; - int cLevel; - CompressionParametersObject* cParams; - ZSTD_frameParameters fParams; /* What to compress. 
*/ DataSource* sources; @@ -868,7 +893,6 @@ Py_ssize_t remainingItems = state->endOffset - state->startOffset + 1; Py_ssize_t currentBufferStartOffset = state->startOffset; size_t zresult; - ZSTD_parameters zparams; void* newDest; size_t allocationSize; size_t boundSize; @@ -879,16 +903,10 @@ assert(!state->destBuffers); assert(0 == state->destCount); - if (state->cParams) { - ztopy_compression_parameters(state->cParams, &zparams.cParams); - } - - zparams.fParams = state->fParams; - /* * The total size of the compressed data is unknown until we actually * compress data. That means we can't pre-allocate the exact size we need. - * + * * There is a cost to every allocation and reallocation. So, it is in our * interest to minimize the number of allocations. * @@ -927,7 +945,8 @@ destBuffer->segmentsSize = remainingItems; - allocationSize = roundpow2(state->totalSourceSize >> 4); + assert(state->totalSourceSize <= SIZE_MAX); + allocationSize = roundpow2((size_t)state->totalSourceSize >> 4); /* If the maximum size of the output is larger than that, round up. */ boundSize = ZSTD_compressBound(sources[inputOffset].sourceSize); @@ -949,6 +968,8 @@ size_t sourceSize = sources[inputOffset].sourceSize; size_t destAvailable; void* dest; + ZSTD_outBuffer opOutBuffer; + ZSTD_inBuffer opInBuffer; destAvailable = destBuffer->destSize - destOffset; boundSize = ZSTD_compressBound(sourceSize); @@ -1004,7 +1025,8 @@ * We could dynamically update allocation size based on work done so far. * For now, keep is simple. 
*/ - allocationSize = roundpow2(state->totalSourceSize >> 4); + assert(state->totalSourceSize <= SIZE_MAX); + allocationSize = roundpow2((size_t)state->totalSourceSize >> 4); if (boundSize > allocationSize) { allocationSize = roundpow2(boundSize); @@ -1032,19 +1054,15 @@ dest = (char*)destBuffer->dest + destOffset; - if (state->cdict) { - zresult = ZSTD_compress_usingCDict(state->cctx, dest, destAvailable, - source, sourceSize, state->cdict); - } - else { - if (!state->cParams) { - zparams.cParams = ZSTD_getCParams(state->cLevel, sourceSize, 0); - } + opInBuffer.src = source; + opInBuffer.size = sourceSize; + opInBuffer.pos = 0; - zresult = ZSTD_compress_advanced(state->cctx, dest, destAvailable, - source, sourceSize, NULL, 0, zparams); - } + opOutBuffer.dst = dest; + opOutBuffer.size = destAvailable; + opOutBuffer.pos = 0; + zresult = ZSTD_CCtx_setPledgedSrcSize(state->cctx, sourceSize); if (ZSTD_isError(zresult)) { state->error = WorkerError_zstd; state->zresult = zresult; @@ -1052,10 +1070,23 @@ break; } + zresult = ZSTD_compress_generic(state->cctx, &opOutBuffer, &opInBuffer, ZSTD_e_end); + if (ZSTD_isError(zresult)) { + state->error = WorkerError_zstd; + state->zresult = zresult; + state->errorOffset = inputOffset; + break; + } + else if (zresult) { + state->error = WorkerError_nospace; + state->errorOffset = inputOffset; + break; + } + destBuffer->segments[inputOffset - currentBufferStartOffset].offset = destOffset; - destBuffer->segments[inputOffset - currentBufferStartOffset].length = zresult; + destBuffer->segments[inputOffset - currentBufferStartOffset].length = opOutBuffer.pos; - destOffset += zresult; + destOffset += opOutBuffer.pos; remainingItems--; } @@ -1072,15 +1103,14 @@ } ZstdBufferWithSegmentsCollection* compress_from_datasources(ZstdCompressor* compressor, - DataSources* sources, unsigned int threadCount) { - ZSTD_parameters zparams; + DataSources* sources, Py_ssize_t threadCount) { unsigned long long bytesPerWorker; POOL_ctx* pool = NULL; 
WorkerState* workerStates = NULL; Py_ssize_t i; unsigned long long workerBytes = 0; Py_ssize_t workerStartOffset = 0; - size_t currentThread = 0; + Py_ssize_t currentThread = 0; int errored = 0; Py_ssize_t segmentsCount = 0; Py_ssize_t segmentIndex; @@ -1093,34 +1123,12 @@ assert(threadCount >= 1); /* More threads than inputs makes no sense. */ - threadCount = sources->sourcesSize < threadCount ? (unsigned int)sources->sourcesSize + threadCount = sources->sourcesSize < threadCount ? sources->sourcesSize : threadCount; /* TODO lower thread count when input size is too small and threads would add overhead. */ - /* - * When dictionaries are used, parameters are derived from the size of the - * first element. - * - * TODO come up with a better mechanism. - */ - memset(&zparams, 0, sizeof(zparams)); - if (compressor->cparams) { - ztopy_compression_parameters(compressor->cparams, &zparams.cParams); - } - else { - zparams.cParams = ZSTD_getCParams(compressor->compressionLevel, - sources->sources[0].sourceSize, - compressor->dict ? 
compressor->dict->dictSize : 0); - } - - zparams.fParams = compressor->fparams; - - if (0 != populate_cdict(compressor, &zparams)) { - return NULL; - } - workerStates = PyMem_Malloc(threadCount * sizeof(WorkerState)); if (NULL == workerStates) { PyErr_NoMemory(); @@ -1140,16 +1148,42 @@ bytesPerWorker = sources->totalSourceSize / threadCount; for (i = 0; i < threadCount; i++) { + size_t zresult; + workerStates[i].cctx = ZSTD_createCCtx(); if (!workerStates[i].cctx) { PyErr_NoMemory(); goto finally; } - workerStates[i].cdict = compressor->cdict; - workerStates[i].cLevel = compressor->compressionLevel; - workerStates[i].cParams = compressor->cparams; - workerStates[i].fParams = compressor->fparams; + zresult = ZSTD_CCtx_setParametersUsingCCtxParams(workerStates[i].cctx, + compressor->params); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not set compression parameters: %s", + ZSTD_getErrorName(zresult)); + goto finally; + } + + if (compressor->dict) { + if (compressor->dict->cdict) { + zresult = ZSTD_CCtx_refCDict(workerStates[i].cctx, compressor->dict->cdict); + } + else { + zresult = ZSTD_CCtx_loadDictionary_advanced( + workerStates[i].cctx, + compressor->dict->dictData, + compressor->dict->dictSize, + ZSTD_dlm_byRef, + compressor->dict->dictType); + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not load compression dictionary: %s", + ZSTD_getErrorName(zresult)); + goto finally; + } + + } workerStates[i].sources = sources->sources; workerStates[i].sourcesSize = sources->sourcesSize; @@ -1221,6 +1255,13 @@ workerStates[i].errorOffset, ZSTD_getErrorName(workerStates[i].zresult)); errored = 1; break; + + case WorkerError_nospace: + PyErr_Format(ZstdError, "error compressing item %zd: not enough space in output", + workerStates[i].errorOffset); + errored = 1; + break; + default: ; } @@ -1341,12 +1382,6 @@ Py_ssize_t sourceCount = 0; ZstdBufferWithSegmentsCollection* result = NULL; - if (self->mtcctx) { - 
PyErr_SetString(ZstdError, - "function cannot be called on ZstdCompressor configured for multi-threaded compression"); - return NULL; - } - memset(&sources, 0, sizeof(sources)); if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:multi_compress_to_buffer", kwlist, @@ -1372,8 +1407,14 @@ } for (i = 0; i < buffer->segmentCount; i++) { + if (buffer->segments[i].length > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "buffer segment %zd is too large for this platform", i); + goto finally; + } + sources.sources[i].sourceData = (char*)buffer->data + buffer->segments[i].offset; - sources.sources[i].sourceSize = buffer->segments[i].length; + sources.sources[i].sourceSize = (size_t)buffer->segments[i].length; sources.totalSourceSize += buffer->segments[i].length; } @@ -1397,8 +1438,15 @@ buffer = collection->buffers[i]; for (j = 0; j < buffer->segmentCount; j++) { + if (buffer->segments[j].length > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "buffer segment %zd in buffer %zd is too large for this platform", + j, i); + goto finally; + } + sources.sources[offset].sourceData = (char*)buffer->data + buffer->segments[j].offset; - sources.sources[offset].sourceSize = buffer->segments[j].length; + sources.sources[offset].sourceSize = (size_t)buffer->segments[j].length; sources.totalSourceSize += buffer->segments[j].length; offset++; @@ -1416,11 +1464,6 @@ goto finally; } - /* - * It isn't clear whether the address referred to by Py_buffer.buf - * is still valid after PyBuffer_Release. We we hold a reference to all - * Py_buffer instances for the duration of the operation. 
- */ dataBuffers = PyMem_Malloc(sourceCount * sizeof(Py_buffer)); if (NULL == dataBuffers) { PyErr_NoMemory(); @@ -1459,6 +1502,11 @@ goto finally; } + if (sources.totalSourceSize > SIZE_MAX) { + PyErr_SetString(PyExc_ValueError, "sources are too large for this platform"); + goto finally; + } + result = compress_from_datasources(self, &sources, threads); finally: @@ -1482,12 +1530,24 @@ METH_VARARGS | METH_KEYWORDS, ZstdCompressionObj__doc__ }, { "copy_stream", (PyCFunction)ZstdCompressor_copy_stream, METH_VARARGS | METH_KEYWORDS, ZstdCompressor_copy_stream__doc__ }, - { "read_from", (PyCFunction)ZstdCompressor_read_from, - METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_from__doc__ }, - { "write_to", (PyCFunction)ZstdCompressor_write_to, - METH_VARARGS | METH_KEYWORDS, ZstdCompressor_write_to___doc__ }, + { "stream_reader", (PyCFunction)ZstdCompressor_stream_reader, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_stream_reader__doc__ }, + { "stream_writer", (PyCFunction)ZstdCompressor_stream_writer, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_stream_writer___doc__ }, + { "read_to_iter", (PyCFunction)ZstdCompressor_read_to_iter, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_to_iter__doc__ }, + /* TODO Remove deprecated API */ + { "read_from", (PyCFunction)ZstdCompressor_read_to_iter, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_to_iter__doc__ }, + /* TODO remove deprecated API */ + { "write_to", (PyCFunction)ZstdCompressor_stream_writer, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_stream_writer___doc__ }, { "multi_compress_to_buffer", (PyCFunction)ZstdCompressor_multi_compress_to_buffer, METH_VARARGS | METH_KEYWORDS, ZstdCompressor_multi_compress_to_buffer__doc__ }, + { "memory_size", (PyCFunction)ZstdCompressor_memory_size, + METH_NOARGS, ZstdCompressor_memory_size__doc__ }, + { "frame_progression", (PyCFunction)ZstdCompressor_frame_progression, + METH_NOARGS, ZstdCompressor_frame_progression__doc__ }, { NULL, NULL } }; diff -r fb92df8b634c -r 
ed5448edcbfa contrib/python-zstandard/c-ext/compressoriterator.c --- a/contrib/python-zstandard/c-ext/compressoriterator.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/compressoriterator.c Wed Apr 18 15:32:08 2018 -0400 @@ -21,10 +21,9 @@ Py_XDECREF(self->compressor); Py_XDECREF(self->reader); - if (self->buffer) { - PyBuffer_Release(self->buffer); - PyMem_FREE(self->buffer); - self->buffer = NULL; + if (self->buffer.buf) { + PyBuffer_Release(&self->buffer); + memset(&self->buffer, 0, sizeof(self->buffer)); } if (self->output.dst) { @@ -58,14 +57,8 @@ /* If we have data left in the input, consume it. */ if (self->input.pos < self->input.size) { Py_BEGIN_ALLOW_THREADS - if (self->compressor->mtcctx) { - zresult = ZSTDMT_compressStream(self->compressor->mtcctx, - &self->output, &self->input); - } - else { - zresult = ZSTD_compressStream(self->compressor->cstream, &self->output, - &self->input); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &self->input, ZSTD_e_continue); Py_END_ALLOW_THREADS /* Release the Python object holding the input buffer. */ @@ -107,14 +100,14 @@ PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize); } else { - assert(self->buffer && self->buffer->buf); + assert(self->buffer.buf); /* Only support contiguous C arrays. 
*/ - assert(self->buffer->strides == NULL && self->buffer->suboffsets == NULL); - assert(self->buffer->itemsize == 1); + assert(self->buffer.strides == NULL && self->buffer.suboffsets == NULL); + assert(self->buffer.itemsize == 1); - readBuffer = (char*)self->buffer->buf + self->bufferOffset; - bufferRemaining = self->buffer->len - self->bufferOffset; + readBuffer = (char*)self->buffer.buf + self->bufferOffset; + bufferRemaining = self->buffer.len - self->bufferOffset; readSize = min(bufferRemaining, (Py_ssize_t)self->inSize); self->bufferOffset += readSize; } @@ -130,12 +123,12 @@ /* EOF */ if (0 == readSize) { - if (self->compressor->mtcctx) { - zresult = ZSTDMT_endStream(self->compressor->mtcctx, &self->output); - } - else { - zresult = ZSTD_endStream(self->compressor->cstream, &self->output); - } + self->input.src = NULL; + self->input.size = 0; + self->input.pos = 0; + + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &self->input, ZSTD_e_end); if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "error ending compression stream: %s", ZSTD_getErrorName(zresult)); @@ -159,13 +152,8 @@ self->input.pos = 0; Py_BEGIN_ALLOW_THREADS - if (self->compressor->mtcctx) { - zresult = ZSTDMT_compressStream(self->compressor->mtcctx, &self->output, - &self->input); - } - else { - zresult = ZSTD_compressStream(self->compressor->cstream, &self->output, &self->input); - } + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &self->input, ZSTD_e_continue); Py_END_ALLOW_THREADS /* The input buffer currently points to memory managed by Python diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/constants.c --- a/contrib/python-zstandard/c-ext/constants.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/constants.c Wed Apr 18 15:32:08 2018 -0400 @@ -52,6 +52,11 @@ PyErr_Format(PyExc_ValueError, "could not create frame header object"); } + PyModule_AddObject(mod, "CONTENTSIZE_UNKNOWN", + 
PyLong_FromUnsignedLongLong(ZSTD_CONTENTSIZE_UNKNOWN)); + PyModule_AddObject(mod, "CONTENTSIZE_ERROR", + PyLong_FromUnsignedLongLong(ZSTD_CONTENTSIZE_ERROR)); + PyModule_AddIntConstant(mod, "MAX_COMPRESSION_LEVEL", ZSTD_maxCLevel()); PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_INPUT_SIZE", (long)ZSTD_CStreamInSize()); @@ -75,7 +80,9 @@ PyModule_AddIntConstant(mod, "SEARCHLENGTH_MIN", ZSTD_SEARCHLENGTH_MIN); PyModule_AddIntConstant(mod, "SEARCHLENGTH_MAX", ZSTD_SEARCHLENGTH_MAX); PyModule_AddIntConstant(mod, "TARGETLENGTH_MIN", ZSTD_TARGETLENGTH_MIN); - PyModule_AddIntConstant(mod, "TARGETLENGTH_MAX", ZSTD_TARGETLENGTH_MAX); + PyModule_AddIntConstant(mod, "LDM_MINMATCH_MIN", ZSTD_LDM_MINMATCH_MIN); + PyModule_AddIntConstant(mod, "LDM_MINMATCH_MAX", ZSTD_LDM_MINMATCH_MAX); + PyModule_AddIntConstant(mod, "LDM_BUCKETSIZELOG_MAX", ZSTD_LDM_BUCKETSIZELOG_MAX); PyModule_AddIntConstant(mod, "STRATEGY_FAST", ZSTD_fast); PyModule_AddIntConstant(mod, "STRATEGY_DFAST", ZSTD_dfast); @@ -84,4 +91,12 @@ PyModule_AddIntConstant(mod, "STRATEGY_LAZY2", ZSTD_lazy2); PyModule_AddIntConstant(mod, "STRATEGY_BTLAZY2", ZSTD_btlazy2); PyModule_AddIntConstant(mod, "STRATEGY_BTOPT", ZSTD_btopt); + PyModule_AddIntConstant(mod, "STRATEGY_BTULTRA", ZSTD_btultra); + + PyModule_AddIntConstant(mod, "DICT_TYPE_AUTO", ZSTD_dct_auto); + PyModule_AddIntConstant(mod, "DICT_TYPE_RAWCONTENT", ZSTD_dct_rawContent); + PyModule_AddIntConstant(mod, "DICT_TYPE_FULLDICT", ZSTD_dct_fullDict); + + PyModule_AddIntConstant(mod, "FORMAT_ZSTD1", ZSTD_f_zstd1); + PyModule_AddIntConstant(mod, "FORMAT_ZSTD1_MAGICLESS", ZSTD_f_zstd1_magicless); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/decompressionreader.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/c-ext/decompressionreader.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,459 @@ +/** +* Copyright (c) 2017-present, Gregory Szorc +* All rights reserved. 
+* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +static void set_unsupported_operation(void) { + PyObject* iomod; + PyObject* exc; + + iomod = PyImport_ImportModule("io"); + if (NULL == iomod) { + return; + } + + exc = PyObject_GetAttrString(iomod, "UnsupportedOperation"); + if (NULL == exc) { + Py_DECREF(iomod); + return; + } + + PyErr_SetNone(exc); + Py_DECREF(exc); + Py_DECREF(iomod); +} + +static void reader_dealloc(ZstdDecompressionReader* self) { + Py_XDECREF(self->decompressor); + Py_XDECREF(self->reader); + + if (self->buffer.buf) { + PyBuffer_Release(&self->buffer); + } + + PyObject_Del(self); +} + +static ZstdDecompressionReader* reader_enter(ZstdDecompressionReader* self) { + if (self->entered) { + PyErr_SetString(PyExc_ValueError, "cannot __enter__ multiple times"); + return NULL; + } + + if (ensure_dctx(self->decompressor, 1)) { + return NULL; + } + + self->entered = 1; + + Py_INCREF(self); + return self; +} + +static PyObject* reader_exit(ZstdDecompressionReader* self, PyObject* args) { + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_tb; + + if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) { + return NULL; + } + + self->entered = 0; + self->closed = 1; + + /* Release resources. 
*/ + Py_CLEAR(self->reader); + if (self->buffer.buf) { + PyBuffer_Release(&self->buffer); + memset(&self->buffer, 0, sizeof(self->buffer)); + } + + Py_CLEAR(self->decompressor); + + Py_RETURN_FALSE; +} + +static PyObject* reader_readable(PyObject* self) { + Py_RETURN_TRUE; +} + +static PyObject* reader_writable(PyObject* self) { + Py_RETURN_FALSE; +} + +static PyObject* reader_seekable(PyObject* self) { + Py_RETURN_TRUE; +} + +static PyObject* reader_close(ZstdDecompressionReader* self) { + self->closed = 1; + Py_RETURN_NONE; +} + +static PyObject* reader_closed(ZstdDecompressionReader* self) { + if (self->closed) { + Py_RETURN_TRUE; + } + else { + Py_RETURN_FALSE; + } +} + +static PyObject* reader_flush(PyObject* self) { + Py_RETURN_NONE; +} + +static PyObject* reader_isatty(PyObject* self) { + Py_RETURN_FALSE; +} + +static PyObject* reader_read(ZstdDecompressionReader* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "size", + NULL + }; + + Py_ssize_t size = -1; + PyObject* result = NULL; + char* resultBuffer; + Py_ssize_t resultSize; + ZSTD_outBuffer output; + size_t zresult; + + if (!self->entered) { + PyErr_SetString(ZstdError, "read() must be called from an active context manager"); + return NULL; + } + + if (self->closed) { + PyErr_SetString(PyExc_ValueError, "stream is closed"); + return NULL; + } + + if (self->finishedOutput) { + return PyBytes_FromStringAndSize("", 0); + } + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", kwlist, &size)) { + return NULL; + } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "cannot read negative or size 0 amounts"); + return NULL; + } + + result = PyBytes_FromStringAndSize(NULL, size); + if (NULL == result) { + return NULL; + } + + PyBytes_AsStringAndSize(result, &resultBuffer, &resultSize); + + output.dst = resultBuffer; + output.size = resultSize; + output.pos = 0; + +readinput: + + /* Consume input data left over from last time. 
*/ + if (self->input.pos < self->input.size) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompress_generic(self->decompressor->dctx, + &output, &self->input); + Py_END_ALLOW_THREADS + + /* Input exhausted. Clear our state tracking. */ + if (self->input.pos == self->input.size) { + memset(&self->input, 0, sizeof(self->input)); + Py_CLEAR(self->readResult); + + if (self->buffer.buf) { + self->finishedInput = 1; + } + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd decompress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + else if (0 == zresult) { + self->finishedOutput = 1; + } + + /* We fulfilled the full read request. Emit it. */ + if (output.pos && output.pos == output.size) { + self->bytesDecompressed += output.size; + return result; + } + + /* + * There is more room in the output. Fall through to try to collect + * more data so we can try to fill the output. + */ + } + + if (!self->finishedInput) { + if (self->reader) { + Py_buffer buffer; + + assert(self->readResult == NULL); + self->readResult = PyObject_CallMethod(self->reader, "read", + "k", self->readSize); + if (NULL == self->readResult) { + return NULL; + } + + memset(&buffer, 0, sizeof(buffer)); + + if (0 != PyObject_GetBuffer(self->readResult, &buffer, PyBUF_CONTIG_RO)) { + return NULL; + } + + /* EOF */ + if (0 == buffer.len) { + self->finishedInput = 1; + Py_CLEAR(self->readResult); + } + else { + self->input.src = buffer.buf; + self->input.size = buffer.len; + self->input.pos = 0; + } + + PyBuffer_Release(&buffer); + } + else { + assert(self->buffer.buf); + /* + * We should only get here once since above block will exhaust + * source buffer until finishedInput is set. 
+ */ + assert(self->input.src == NULL); + + self->input.src = self->buffer.buf; + self->input.size = self->buffer.len; + self->input.pos = 0; + } + } + + if (self->input.size) { + goto readinput; + } + + /* EOF */ + self->bytesDecompressed += output.pos; + + if (safe_pybytes_resize(&result, output.pos)) { + Py_XDECREF(result); + return NULL; + } + + return result; +} + +static PyObject* reader_readall(PyObject* self) { + PyErr_SetNone(PyExc_NotImplementedError); + return NULL; +} + +static PyObject* reader_readline(PyObject* self) { + PyErr_SetNone(PyExc_NotImplementedError); + return NULL; +} + +static PyObject* reader_readlines(PyObject* self) { + PyErr_SetNone(PyExc_NotImplementedError); + return NULL; +} + +static PyObject* reader_seek(ZstdDecompressionReader* self, PyObject* args) { + Py_ssize_t pos; + int whence = 0; + unsigned long long readAmount = 0; + size_t defaultOutSize = ZSTD_DStreamOutSize(); + + if (!self->entered) { + PyErr_SetString(ZstdError, "seek() must be called from an active context manager"); + return NULL; + } + + if (self->closed) { + PyErr_SetString(PyExc_ValueError, "stream is closed"); + return NULL; + } + + if (!PyArg_ParseTuple(args, "n|i:seek", &pos, &whence)) { + return NULL; + } + + if (whence == SEEK_SET) { + if (pos < 0) { + PyErr_SetString(PyExc_ValueError, + "cannot seek to negative position with SEEK_SET"); + return NULL; + } + + if ((unsigned long long)pos < self->bytesDecompressed) { + PyErr_SetString(PyExc_ValueError, + "cannot seek zstd decompression stream backwards"); + return NULL; + } + + readAmount = pos - self->bytesDecompressed; + } + else if (whence == SEEK_CUR) { + if (pos < 0) { + PyErr_SetString(PyExc_ValueError, + "cannot seek zstd decompression stream backwards"); + return NULL; + } + + readAmount = pos; + } + else if (whence == SEEK_END) { + /* We /could/ support this with pos==0. But let's not do that until someone + needs it. 
*/ + PyErr_SetString(PyExc_ValueError, + "zstd decompression streams cannot be seeked with SEEK_END"); + return NULL; + } + + /* It is a bit inefficient to do this via the Python API. But since there + is a bit of state tracking involved to read from this type, it is the + easiest to implement. */ + while (readAmount) { + Py_ssize_t readSize; + PyObject* readResult = PyObject_CallMethod((PyObject*)self, "read", "K", + readAmount < defaultOutSize ? readAmount : defaultOutSize); + + if (!readResult) { + return NULL; + } + + readSize = PyBytes_GET_SIZE(readResult); + + /* Empty read means EOF. */ + if (!readSize) { + break; + } + + readAmount -= readSize; + } + + return PyLong_FromUnsignedLongLong(self->bytesDecompressed); +} + +static PyObject* reader_tell(ZstdDecompressionReader* self) { + /* TODO should this raise OSError since stream isn't seekable? */ + return PyLong_FromUnsignedLongLong(self->bytesDecompressed); +} + +static PyObject* reader_write(PyObject* self, PyObject* args) { + set_unsupported_operation(); + return NULL; +} + +static PyObject* reader_writelines(PyObject* self, PyObject* args) { + set_unsupported_operation(); + return NULL; +} + +static PyObject* reader_iter(PyObject* self) { + PyErr_SetNone(PyExc_NotImplementedError); + return NULL; +} + +static PyObject* reader_iternext(PyObject* self) { + PyErr_SetNone(PyExc_NotImplementedError); + return NULL; +} + +static PyMethodDef reader_methods[] = { + { "__enter__", (PyCFunction)reader_enter, METH_NOARGS, + PyDoc_STR("Enter a compression context") }, + { "__exit__", (PyCFunction)reader_exit, METH_VARARGS, + PyDoc_STR("Exit a compression context") }, + { "close", (PyCFunction)reader_close, METH_NOARGS, + PyDoc_STR("Close the stream so it cannot perform any more operations") }, + { "closed", (PyCFunction)reader_closed, METH_NOARGS, + PyDoc_STR("Whether stream is closed") }, + { "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") }, + { "isatty", (PyCFunction)reader_isatty, 
METH_NOARGS, PyDoc_STR("Returns False") }, + { "readable", (PyCFunction)reader_readable, METH_NOARGS, + PyDoc_STR("Returns True") }, + { "read", (PyCFunction)reader_read, METH_VARARGS | METH_KEYWORDS, + PyDoc_STR("read compressed data") }, + { "readall", (PyCFunction)reader_readall, METH_NOARGS, PyDoc_STR("Not implemented") }, + { "readline", (PyCFunction)reader_readline, METH_NOARGS, PyDoc_STR("Not implemented") }, + { "readlines", (PyCFunction)reader_readlines, METH_NOARGS, PyDoc_STR("Not implemented") }, + { "seek", (PyCFunction)reader_seek, METH_VARARGS, PyDoc_STR("Seek the stream") }, + { "seekable", (PyCFunction)reader_seekable, METH_NOARGS, + PyDoc_STR("Returns True") }, + { "tell", (PyCFunction)reader_tell, METH_NOARGS, + PyDoc_STR("Returns current number of bytes compressed") }, + { "writable", (PyCFunction)reader_writable, METH_NOARGS, + PyDoc_STR("Returns False") }, + { "write", (PyCFunction)reader_write, METH_VARARGS, PyDoc_STR("unsupported operation") }, + { "writelines", (PyCFunction)reader_writelines, METH_VARARGS, PyDoc_STR("unsupported operation") }, + { NULL, NULL } +}; + +PyTypeObject ZstdDecompressionReaderType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdDecompressionReader", /* tp_name */ + sizeof(ZstdDecompressionReader), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)reader_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + reader_iter, /* tp_iter */ + reader_iternext, /* tp_iternext */ + reader_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ 
+ 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + + +void decompressionreader_module_init(PyObject* mod) { + /* TODO make reader a sub-class of io.RawIOBase */ + + Py_TYPE(&ZstdDecompressionReaderType) = &PyType_Type; + if (PyType_Ready(&ZstdDecompressionReaderType) < 0) { + return; + } +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/decompressionwriter.c --- a/contrib/python-zstandard/c-ext/decompressionwriter.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/decompressionwriter.c Wed Apr 18 15:32:08 2018 -0400 @@ -27,7 +27,7 @@ return NULL; } - if (0 != init_dstream(self->decompressor)) { + if (ensure_dctx(self->decompressor, 1)) { return NULL; } @@ -44,18 +44,17 @@ } static PyObject* ZstdDecompressionWriter_memory_size(ZstdDecompressionWriter* self) { - if (!self->decompressor->dstream) { - PyErr_SetString(ZstdError, "cannot determine size of inactive decompressor; " - "call when context manager is active"); - return NULL; - } - - return PyLong_FromSize_t(ZSTD_sizeof_DStream(self->decompressor->dstream)); + return PyLong_FromSize_t(ZSTD_sizeof_DCtx(self->decompressor->dctx)); } -static PyObject* ZstdDecompressionWriter_write(ZstdDecompressionWriter* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; +static PyObject* ZstdDecompressionWriter_write(ZstdDecompressionWriter* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + NULL + }; + + PyObject* result = NULL; + Py_buffer source; size_t zresult = 0; ZSTD_inBuffer input; ZSTD_outBuffer output; @@ -63,41 +62,47 @@ Py_ssize_t totalWrite = 0; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:write", &source, &sourceSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:write", #else - if (!PyArg_ParseTuple(args, "s#:write", &source, &sourceSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:write", 
#endif + kwlist, &source)) { return NULL; } + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + if (!self->entered) { PyErr_SetString(ZstdError, "write must be called from an active context manager"); - return NULL; + goto finally; } - assert(self->decompressor->dstream); - output.dst = PyMem_Malloc(self->outSize); if (!output.dst) { - return PyErr_NoMemory(); + PyErr_NoMemory(); + goto finally; } output.size = self->outSize; output.pos = 0; - input.src = source; - input.size = sourceSize; + input.src = source.buf; + input.size = source.len; input.pos = 0; - while ((ssize_t)input.pos < sourceSize) { + while ((ssize_t)input.pos < source.len) { Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompressStream(self->decompressor->dstream, &output, &input); + zresult = ZSTD_decompress_generic(self->decompressor->dctx, &output, &input); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyMem_Free(output.dst); PyErr_Format(ZstdError, "zstd decompress error: %s", ZSTD_getErrorName(zresult)); - return NULL; + goto finally; } if (output.pos) { @@ -115,7 +120,11 @@ PyMem_Free(output.dst); - return PyLong_FromSsize_t(totalWrite); + result = PyLong_FromSsize_t(totalWrite); + +finally: + PyBuffer_Release(&source); + return result; } static PyMethodDef ZstdDecompressionWriter_methods[] = { @@ -125,7 +134,7 @@ PyDoc_STR("Exit a decompression context.") }, { "memory_size", (PyCFunction)ZstdDecompressionWriter_memory_size, METH_NOARGS, PyDoc_STR("Obtain the memory size in bytes of the underlying decompressor.") }, - { "write", (PyCFunction)ZstdDecompressionWriter_write, METH_VARARGS, + { "write", (PyCFunction)ZstdDecompressionWriter_write, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("Compress data") }, { NULL, NULL } }; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/decompressobj.c --- a/contrib/python-zstandard/c-ext/decompressobj.c 
Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/decompressobj.c Wed Apr 18 15:32:08 2018 -0400 @@ -20,56 +20,61 @@ PyObject_Del(self); } -static PyObject* DecompressionObj_decompress(ZstdDecompressionObj* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; +static PyObject* DecompressionObj_decompress(ZstdDecompressionObj* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + NULL + }; + + Py_buffer source; size_t zresult; ZSTD_inBuffer input; ZSTD_outBuffer output; - size_t outSize = ZSTD_DStreamOutSize(); PyObject* result = NULL; Py_ssize_t resultSize = 0; - /* Constructor should ensure stream is populated. */ - assert(self->decompressor->dstream); - if (self->finished) { PyErr_SetString(ZstdError, "cannot use a decompressobj multiple times"); return NULL; } #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:decompress", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:decompress", #else - if (!PyArg_ParseTuple(args, "s#:decompress", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:decompress", #endif - &source, &sourceSize)) { + kwlist, &source)) { return NULL; } - input.src = source; - input.size = sourceSize; + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + + input.src = source.buf; + input.size = source.len; input.pos = 0; - output.dst = PyMem_Malloc(outSize); + output.dst = PyMem_Malloc(self->outSize); if (!output.dst) { PyErr_NoMemory(); - return NULL; + goto except; } - output.size = outSize; + output.size = self->outSize; output.pos = 0; /* Read input until exhausted. 
*/ while (input.pos < input.size) { Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompressStream(self->decompressor->dstream, &output, &input); + zresult = ZSTD_decompress_generic(self->decompressor->dctx, &output, &input); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "zstd decompressor error: %s", ZSTD_getErrorName(zresult)); - result = NULL; - goto finally; + goto except; } if (0 == zresult) { @@ -79,7 +84,8 @@ if (output.pos) { if (result) { resultSize = PyBytes_GET_SIZE(result); - if (-1 == _PyBytes_Resize(&result, resultSize + output.pos)) { + if (-1 == safe_pybytes_resize(&result, resultSize + output.pos)) { + Py_XDECREF(result); goto except; } @@ -108,13 +114,14 @@ finally: PyMem_Free(output.dst); + PyBuffer_Release(&source); return result; } static PyMethodDef DecompressionObj_methods[] = { { "decompress", (PyCFunction)DecompressionObj_decompress, - METH_VARARGS, PyDoc_STR("decompress data") }, + METH_VARARGS | METH_KEYWORDS, PyDoc_STR("decompress data") }, { NULL, NULL } }; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/decompressor.c --- a/contrib/python-zstandard/c-ext/decompressor.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/decompressor.c Wed Apr 18 15:32:08 2018 -0400 @@ -12,54 +12,40 @@ extern PyObject* ZstdError; /** - * Ensure the ZSTD_DStream on a ZstdDecompressor is initialized and reset. - * - * This should be called before starting a decompression operation with a - * ZSTD_DStream on a ZstdDecompressor. - */ -int init_dstream(ZstdDecompressor* decompressor) { - void* dictData = NULL; - size_t dictSize = 0; + * Ensure the ZSTD_DCtx on a decompressor is initiated and ready for a new operation. + */ +int ensure_dctx(ZstdDecompressor* decompressor, int loadDict) { size_t zresult; - /* Simple case of dstream already exists. Just reset it. 
*/ - if (decompressor->dstream) { - zresult = ZSTD_resetDStream(decompressor->dstream); + ZSTD_DCtx_reset(decompressor->dctx); + + if (decompressor->maxWindowSize) { + zresult = ZSTD_DCtx_setMaxWindowSize(decompressor->dctx, decompressor->maxWindowSize); if (ZSTD_isError(zresult)) { - PyErr_Format(ZstdError, "could not reset DStream: %s", + PyErr_Format(ZstdError, "unable to set max window size: %s", ZSTD_getErrorName(zresult)); - return -1; + return 1; } - - return 0; } - decompressor->dstream = ZSTD_createDStream(); - if (!decompressor->dstream) { - PyErr_SetString(ZstdError, "could not create DStream"); - return -1; - } - - if (decompressor->dict) { - dictData = decompressor->dict->dictData; - dictSize = decompressor->dict->dictSize; + zresult = ZSTD_DCtx_setFormat(decompressor->dctx, decompressor->format); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "unable to set decoding format: %s", + ZSTD_getErrorName(zresult)); + return 1; } - if (dictData) { - zresult = ZSTD_initDStream_usingDict(decompressor->dstream, dictData, dictSize); - } - else { - zresult = ZSTD_initDStream(decompressor->dstream); - } + if (loadDict && decompressor->dict) { + if (ensure_ddict(decompressor->dict)) { + return 1; + } - if (ZSTD_isError(zresult)) { - /* Don't leave a reference to an invalid object. 
*/ - ZSTD_freeDStream(decompressor->dstream); - decompressor->dstream = NULL; - - PyErr_Format(ZstdError, "could not initialize DStream: %s", - ZSTD_getErrorName(zresult)); - return -1; + zresult = ZSTD_DCtx_refDDict(decompressor->dctx, decompressor->dict->ddict); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "unable to reference prepared dictionary: %s", + ZSTD_getErrorName(zresult)); + return 1; + } } return 0; @@ -76,36 +62,46 @@ static int Decompressor_init(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "dict_data", + "max_window_size", + "format", NULL }; ZstdCompressionDict* dict = NULL; + size_t maxWindowSize = 0; + ZSTD_format_e format = ZSTD_f_zstd1; self->dctx = NULL; self->dict = NULL; - self->ddict = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!:ZstdDecompressor", kwlist, - &ZstdCompressionDictType, &dict)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!II:ZstdDecompressor", kwlist, + &ZstdCompressionDictType, &dict, &maxWindowSize, &format)) { return -1; } - /* TODO lazily initialize the reference ZSTD_DCtx on first use since - not instances of ZstdDecompressor will use a ZSTD_DCtx. 
*/ self->dctx = ZSTD_createDCtx(); if (!self->dctx) { PyErr_NoMemory(); goto except; } + self->maxWindowSize = maxWindowSize; + self->format = format; + if (dict) { self->dict = dict; Py_INCREF(dict); } + if (ensure_dctx(self, 1)) { + goto except; + } + return 0; except: + Py_CLEAR(self->dict); + if (self->dctx) { ZSTD_freeDCtx(self->dctx); self->dctx = NULL; @@ -117,16 +113,6 @@ static void Decompressor_dealloc(ZstdDecompressor* self) { Py_CLEAR(self->dict); - if (self->ddict) { - ZSTD_freeDDict(self->ddict); - self->ddict = NULL; - } - - if (self->dstream) { - ZSTD_freeDStream(self->dstream); - self->dstream = NULL; - } - if (self->dctx) { ZSTD_freeDCtx(self->dctx); self->dctx = NULL; @@ -135,6 +121,20 @@ PyObject_Del(self); } +PyDoc_STRVAR(Decompressor_memory_size__doc__, +"memory_size() -- Size of decompression context, in bytes\n" +); + +static PyObject* Decompressor_memory_size(ZstdDecompressor* self) { + if (self->dctx) { + return PyLong_FromSize_t(ZSTD_sizeof_DCtx(self->dctx)); + } + else { + PyErr_SetString(ZstdError, "no decompressor context found; this should never happen"); + return NULL; + } +} + PyDoc_STRVAR(Decompressor_copy_stream__doc__, "copy_stream(ifh, ofh[, read_size=default, write_size=default]) -- decompress data between streams\n" "\n" @@ -166,7 +166,7 @@ Py_ssize_t totalWrite = 0; char* readBuffer; Py_ssize_t readSize; - PyObject* readResult; + PyObject* readResult = NULL; PyObject* res = NULL; size_t zresult = 0; PyObject* writeResult; @@ -191,7 +191,7 @@ /* Prevent free on uninitialized memory in finally. 
*/ output.dst = NULL; - if (0 != init_dstream(self)) { + if (ensure_dctx(self, 1)) { res = NULL; goto finally; } @@ -229,7 +229,7 @@ while (input.pos < input.size) { Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompressStream(self->dstream, &output, &input); + zresult = ZSTD_decompress_generic(self->dctx, &output, &input); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { @@ -252,6 +252,8 @@ output.pos = 0; } } + + Py_CLEAR(readResult); } /* Source stream is exhausted. Finish up. */ @@ -267,6 +269,8 @@ PyMem_Free(output.dst); } + Py_XDECREF(readResult); + return res; } @@ -300,98 +304,114 @@ NULL }; - const char* source; - Py_ssize_t sourceSize; + Py_buffer source; Py_ssize_t maxOutputSize = 0; unsigned long long decompressedSize; size_t destCapacity; PyObject* result = NULL; - void* dictData = NULL; - size_t dictSize = 0; size_t zresult; + ZSTD_outBuffer outBuffer; + ZSTD_inBuffer inBuffer; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n:decompress", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|n:decompress", #else - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n:decompress", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|n:decompress", #endif - kwlist, &source, &sourceSize, &maxOutputSize)) { + kwlist, &source, &maxOutputSize)) { return NULL; } - if (self->dict) { - dictData = self->dict->dictData; - dictSize = self->dict->dictSize; + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; } - if (dictData && !self->ddict) { - Py_BEGIN_ALLOW_THREADS - self->ddict = ZSTD_createDDict_byReference(dictData, dictSize); - Py_END_ALLOW_THREADS - - if (!self->ddict) { - PyErr_SetString(ZstdError, "could not create decompression dict"); - return NULL; - } + if (ensure_dctx(self, 1)) { + goto finally; } - decompressedSize = ZSTD_getDecompressedSize(source, sourceSize); - /* 0 returned if content 
size not in the zstd frame header */ - if (0 == decompressedSize) { + decompressedSize = ZSTD_getFrameContentSize(source.buf, source.len); + + if (ZSTD_CONTENTSIZE_ERROR == decompressedSize) { + PyErr_SetString(ZstdError, "error determining content size from frame header"); + goto finally; + } + /* Special case of empty frame. */ + else if (0 == decompressedSize) { + result = PyBytes_FromStringAndSize("", 0); + goto finally; + } + /* Missing content size in frame header. */ + if (ZSTD_CONTENTSIZE_UNKNOWN == decompressedSize) { if (0 == maxOutputSize) { - PyErr_SetString(ZstdError, "input data invalid or missing content size " - "in frame header"); - return NULL; + PyErr_SetString(ZstdError, "could not determine content size in frame header"); + goto finally; } - else { - result = PyBytes_FromStringAndSize(NULL, maxOutputSize); - destCapacity = maxOutputSize; + + result = PyBytes_FromStringAndSize(NULL, maxOutputSize); + destCapacity = maxOutputSize; + decompressedSize = 0; + } + /* Size is recorded in frame header. 
*/ + else { + assert(SIZE_MAX >= PY_SSIZE_T_MAX); + if (decompressedSize > PY_SSIZE_T_MAX) { + PyErr_SetString(ZstdError, "frame is too large to decompress on this platform"); + goto finally; } - } - else { - result = PyBytes_FromStringAndSize(NULL, decompressedSize); - destCapacity = decompressedSize; + + result = PyBytes_FromStringAndSize(NULL, (Py_ssize_t)decompressedSize); + destCapacity = (size_t)decompressedSize; } if (!result) { - return NULL; + goto finally; } + outBuffer.dst = PyBytes_AsString(result); + outBuffer.size = destCapacity; + outBuffer.pos = 0; + + inBuffer.src = source.buf; + inBuffer.size = source.len; + inBuffer.pos = 0; + Py_BEGIN_ALLOW_THREADS - if (self->ddict) { - zresult = ZSTD_decompress_usingDDict(self->dctx, - PyBytes_AsString(result), destCapacity, - source, sourceSize, self->ddict); - } - else { - zresult = ZSTD_decompressDCtx(self->dctx, - PyBytes_AsString(result), destCapacity, source, sourceSize); - } + zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult)); - Py_DECREF(result); - return NULL; + Py_CLEAR(result); + goto finally; } - else if (decompressedSize && zresult != decompressedSize) { + else if (zresult) { + PyErr_Format(ZstdError, "decompression error: did not decompress full frame"); + Py_CLEAR(result); + goto finally; + } + else if (decompressedSize && outBuffer.pos != decompressedSize) { PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu", zresult, decompressedSize); - Py_DECREF(result); - return NULL; + Py_CLEAR(result); + goto finally; } - else if (zresult < destCapacity) { - if (_PyBytes_Resize(&result, zresult)) { - Py_DECREF(result); - return NULL; + else if (outBuffer.pos < destCapacity) { + if (safe_pybytes_resize(&result, outBuffer.pos)) { + Py_CLEAR(result); + goto finally; } } +finally: + PyBuffer_Release(&source); return result; } 
PyDoc_STRVAR(Decompressor_decompressobj__doc__, -"decompressobj()\n" +"decompressobj([write_size=default])\n" "\n" "Incrementally feed data into a decompressor.\n" "\n" @@ -400,25 +420,43 @@ "callers can swap in the zstd decompressor while using the same API.\n" ); -static ZstdDecompressionObj* Decompressor_decompressobj(ZstdDecompressor* self) { - ZstdDecompressionObj* result = (ZstdDecompressionObj*)PyObject_CallObject((PyObject*)&ZstdDecompressionObjType, NULL); +static ZstdDecompressionObj* Decompressor_decompressobj(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "write_size", + NULL + }; + + ZstdDecompressionObj* result = NULL; + size_t outSize = ZSTD_DStreamOutSize(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|k:decompressobj", kwlist, &outSize)) { + return NULL; + } + + if (!outSize) { + PyErr_SetString(PyExc_ValueError, "write_size must be positive"); + return NULL; + } + + result = (ZstdDecompressionObj*)PyObject_CallObject((PyObject*)&ZstdDecompressionObjType, NULL); if (!result) { return NULL; } - if (0 != init_dstream(self)) { + if (ensure_dctx(self, 1)) { Py_DECREF(result); return NULL; } result->decompressor = self; Py_INCREF(result->decompressor); + result->outSize = outSize; return result; } -PyDoc_STRVAR(Decompressor_read_from__doc__, -"read_from(reader[, read_size=default, write_size=default, skip_bytes=0])\n" +PyDoc_STRVAR(Decompressor_read_to_iter__doc__, +"read_to_iter(reader[, read_size=default, write_size=default, skip_bytes=0])\n" "Read compressed data and return an iterator\n" "\n" "Returns an iterator of decompressed data chunks produced from reading from\n" @@ -437,7 +475,7 @@ "the source.\n" ); -static ZstdDecompressorIterator* Decompressor_read_from(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { +static ZstdDecompressorIterator* Decompressor_read_to_iter(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "reader", "read_size", @@ -452,7 
+490,7 @@ ZstdDecompressorIterator* result; size_t skipBytes = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_from", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_to_iter", kwlist, &reader, &inSize, &outSize, &skipBytes)) { return NULL; } @@ -474,14 +512,7 @@ } else if (1 == PyObject_CheckBuffer(reader)) { /* Object claims it is a buffer. Try to get a handle to it. */ - result->buffer = PyMem_Malloc(sizeof(Py_buffer)); - if (!result->buffer) { - goto except; - } - - memset(result->buffer, 0, sizeof(Py_buffer)); - - if (0 != PyObject_GetBuffer(reader, result->buffer, PyBUF_CONTIG_RO)) { + if (0 != PyObject_GetBuffer(reader, &result->buffer, PyBUF_CONTIG_RO)) { goto except; } } @@ -498,7 +529,7 @@ result->outSize = outSize; result->skipBytes = skipBytes; - if (0 != init_dstream(self)) { + if (ensure_dctx(self, 1)) { goto except; } @@ -511,13 +542,6 @@ goto finally; except: - Py_CLEAR(result->reader); - - if (result->buffer) { - PyBuffer_Release(result->buffer); - Py_CLEAR(result->buffer); - } - Py_CLEAR(result); finally: @@ -525,7 +549,62 @@ return result; } -PyDoc_STRVAR(Decompressor_write_to__doc__, +PyDoc_STRVAR(Decompressor_stream_reader__doc__, +"stream_reader(source, [read_size=default])\n" +"\n" +"Obtain an object that behaves like an I/O stream that can be used for\n" +"reading decompressed output from an object.\n" +"\n" +"The source object can be any object with a ``read(size)`` method or that\n" +"conforms to the buffer protocol.\n" +); + +static ZstdDecompressionReader* Decompressor_stream_reader(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "source", + "read_size", + NULL + }; + + PyObject* source; + size_t readSize = ZSTD_DStreamInSize(); + ZstdDecompressionReader* result; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:stream_reader", kwlist, + &source, &readSize)) { + return NULL; + } + + result = 
(ZstdDecompressionReader*)PyObject_CallObject((PyObject*)&ZstdDecompressionReaderType, NULL); + if (NULL == result) { + return NULL; + } + + if (PyObject_HasAttrString(source, "read")) { + result->reader = source; + Py_INCREF(source); + result->readSize = readSize; + } + else if (1 == PyObject_CheckBuffer(source)) { + if (0 != PyObject_GetBuffer(source, &result->buffer, PyBUF_CONTIG_RO)) { + Py_CLEAR(result); + return NULL; + } + } + else { + PyErr_SetString(PyExc_TypeError, + "must pass an object with a read() method or that conforms to the buffer protocol"); + Py_CLEAR(result); + return NULL; + } + + result->decompressor = self; + Py_INCREF(self); + + return result; +} + +PyDoc_STRVAR(Decompressor_stream_writer__doc__, "Create a context manager to write decompressed data to an object.\n" "\n" "The passed object must have a ``write()`` method.\n" @@ -538,7 +617,7 @@ "streaming decompressor.\n" ); -static ZstdDecompressionWriter* Decompressor_write_to(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { +static ZstdDecompressionWriter* Decompressor_stream_writer(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "writer", "write_size", @@ -549,7 +628,7 @@ size_t outSize = ZSTD_DStreamOutSize(); ZstdDecompressionWriter* result; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:write_to", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:stream_writer", kwlist, &writer, &outSize)) { return NULL; } @@ -579,7 +658,7 @@ "Decompress a series of chunks using the content dictionary chaining technique\n" ); -static PyObject* Decompressor_decompress_content_dict_chain(PyObject* self, PyObject* args, PyObject* kwargs) { +static PyObject* Decompressor_decompress_content_dict_chain(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { static char* kwlist[] = { "frames", NULL @@ -592,9 +671,8 @@ PyObject* chunk; char* chunkData; Py_ssize_t chunkSize; - ZSTD_DCtx* dctx = NULL; size_t zresult; - ZSTD_frameParams 
frameParams; + ZSTD_frameHeader frameHeader; void* buffer1 = NULL; size_t buffer1Size = 0; size_t buffer1ContentSize = 0; @@ -603,6 +681,8 @@ size_t buffer2ContentSize = 0; void* destBuffer = NULL; PyObject* result = NULL; + ZSTD_outBuffer outBuffer; + ZSTD_inBuffer inBuffer; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain", kwlist, &PyList_Type, &chunks)) { @@ -624,7 +704,7 @@ /* We require that all chunks be zstd frames and that they have content size set. */ PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize); - zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize); + zresult = ZSTD_getFrameHeader(&frameHeader, (void*)chunkData, chunkSize); if (ZSTD_isError(zresult)) { PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame"); return NULL; @@ -634,32 +714,56 @@ return NULL; } - if (0 == frameParams.frameContentSize) { + if (ZSTD_CONTENTSIZE_UNKNOWN == frameHeader.frameContentSize) { PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame"); return NULL; } - dctx = ZSTD_createDCtx(); - if (!dctx) { - PyErr_NoMemory(); + assert(ZSTD_CONTENTSIZE_ERROR != frameHeader.frameContentSize); + + /* We check against PY_SSIZE_T_MAX here because we ultimately cast the + * result to a Python object and it's length can be no greater than + * Py_ssize_t. In theory, we could have an intermediate frame that is + * larger. But a) why would this API be used for frames that large b) + * it isn't worth the complexity to support. 
*/ + assert(SIZE_MAX >= PY_SSIZE_T_MAX); + if (frameHeader.frameContentSize > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_ValueError, + "chunk 0 is too large to decompress on this platform"); + return NULL; + } + + if (ensure_dctx(self, 0)) { goto finally; } - buffer1Size = frameParams.frameContentSize; + buffer1Size = (size_t)frameHeader.frameContentSize; buffer1 = PyMem_Malloc(buffer1Size); if (!buffer1) { goto finally; } + outBuffer.dst = buffer1; + outBuffer.size = buffer1Size; + outBuffer.pos = 0; + + inBuffer.src = chunkData; + inBuffer.size = chunkSize; + inBuffer.pos = 0; + Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompressDCtx(dctx, buffer1, buffer1Size, chunkData, chunkSize); + zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult)); goto finally; } + else if (zresult) { + PyErr_Format(ZstdError, "chunk 0 did not decompress full frame"); + goto finally; + } - buffer1ContentSize = zresult; + buffer1ContentSize = outBuffer.pos; /* Special case of a simple chain. */ if (1 == chunksLen) { @@ -668,7 +772,7 @@ } /* This should ideally look at next chunk. But this is slightly simpler. 
*/ - buffer2Size = frameParams.frameContentSize; + buffer2Size = (size_t)frameHeader.frameContentSize; buffer2 = PyMem_Malloc(buffer2Size); if (!buffer2) { goto finally; @@ -688,7 +792,7 @@ } PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize); - zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize); + zresult = ZSTD_getFrameHeader(&frameHeader, (void*)chunkData, chunkSize); if (ZSTD_isError(zresult)) { PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex); goto finally; @@ -698,18 +802,30 @@ goto finally; } - if (0 == frameParams.frameContentSize) { + if (ZSTD_CONTENTSIZE_UNKNOWN == frameHeader.frameContentSize) { PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex); goto finally; } + assert(ZSTD_CONTENTSIZE_ERROR != frameHeader.frameContentSize); + + if (frameHeader.frameContentSize > PY_SSIZE_T_MAX) { + PyErr_Format(PyExc_ValueError, + "chunk %zd is too large to decompress on this platform", chunkIndex); + goto finally; + } + + inBuffer.src = chunkData; + inBuffer.size = chunkSize; + inBuffer.pos = 0; + parity = chunkIndex % 2; /* This could definitely be abstracted to reduce code duplication. */ if (parity) { /* Resize destination buffer to hold larger content. 
*/ - if (buffer2Size < frameParams.frameContentSize) { - buffer2Size = frameParams.frameContentSize; + if (buffer2Size < frameHeader.frameContentSize) { + buffer2Size = (size_t)frameHeader.frameContentSize; destBuffer = PyMem_Realloc(buffer2, buffer2Size); if (!destBuffer) { goto finally; @@ -718,19 +834,38 @@ } Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompress_usingDict(dctx, buffer2, buffer2Size, - chunkData, chunkSize, buffer1, buffer1ContentSize); + zresult = ZSTD_DCtx_refPrefix_advanced(self->dctx, + buffer1, buffer1ContentSize, ZSTD_dct_rawContent); + Py_END_ALLOW_THREADS + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, + "failed to load prefix dictionary at chunk %zd", chunkIndex); + goto finally; + } + + outBuffer.dst = buffer2; + outBuffer.size = buffer2Size; + outBuffer.pos = 0; + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "could not decompress chunk %zd: %s", chunkIndex, ZSTD_getErrorName(zresult)); goto finally; } - buffer2ContentSize = zresult; + else if (zresult) { + PyErr_Format(ZstdError, "chunk %zd did not decompress full frame", + chunkIndex); + goto finally; + } + + buffer2ContentSize = outBuffer.pos; } else { - if (buffer1Size < frameParams.frameContentSize) { - buffer1Size = frameParams.frameContentSize; + if (buffer1Size < frameHeader.frameContentSize) { + buffer1Size = (size_t)frameHeader.frameContentSize; destBuffer = PyMem_Realloc(buffer1, buffer1Size); if (!destBuffer) { goto finally; @@ -739,15 +874,34 @@ } Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompress_usingDict(dctx, buffer1, buffer1Size, - chunkData, chunkSize, buffer2, buffer2ContentSize); + zresult = ZSTD_DCtx_refPrefix_advanced(self->dctx, + buffer2, buffer2ContentSize, ZSTD_dct_rawContent); + Py_END_ALLOW_THREADS + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, + "failed to load prefix dictionary at chunk %zd", chunkIndex); + goto 
finally; + } + + outBuffer.dst = buffer1; + outBuffer.size = buffer1Size; + outBuffer.pos = 0; + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "could not decompress chunk %zd: %s", chunkIndex, ZSTD_getErrorName(zresult)); goto finally; } - buffer1ContentSize = zresult; + else if (zresult) { + PyErr_Format(ZstdError, "chunk %zd did not decompress full frame", + chunkIndex); + goto finally; + } + + buffer1ContentSize = outBuffer.pos; } } @@ -762,17 +916,13 @@ PyMem_Free(buffer1); } - if (dctx) { - ZSTD_freeDCtx(dctx); - } - return result; } typedef struct { void* sourceData; size_t sourceSize; - unsigned long long destSize; + size_t destSize; } FramePointer; typedef struct { @@ -806,7 +956,6 @@ /* Compression state and settings. */ ZSTD_DCtx* dctx; - ZSTD_DDict* ddict; int requireOutputSizes; /* Output storage. */ @@ -838,6 +987,14 @@ assert(0 == state->destCount); assert(state->endOffset - state->startOffset >= 0); + /* We could get here due to the way work is allocated. Ideally we wouldn't + get here. But that would require a bit of a refactor in the caller. */ + if (state->totalSourceSize > SIZE_MAX) { + state->error = WorkerError_memory; + state->errorOffset = 0; + return; + } + /* * We need to allocate a buffer to hold decompressed data. How we do this * depends on what we know about the output. The following scenarios are @@ -853,14 +1010,34 @@ /* Resolve ouput segments. 
*/ for (frameIndex = state->startOffset; frameIndex <= state->endOffset; frameIndex++) { FramePointer* fp = &framePointers[frameIndex]; + unsigned long long decompressedSize; if (0 == fp->destSize) { - fp->destSize = ZSTD_getDecompressedSize(fp->sourceData, fp->sourceSize); - if (0 == fp->destSize && state->requireOutputSizes) { + decompressedSize = ZSTD_getFrameContentSize(fp->sourceData, fp->sourceSize); + + if (ZSTD_CONTENTSIZE_ERROR == decompressedSize) { state->error = WorkerError_unknownSize; state->errorOffset = frameIndex; return; } + else if (ZSTD_CONTENTSIZE_UNKNOWN == decompressedSize) { + if (state->requireOutputSizes) { + state->error = WorkerError_unknownSize; + state->errorOffset = frameIndex; + return; + } + + /* This will fail the assert for .destSize > 0 below. */ + decompressedSize = 0; + } + + if (decompressedSize > SIZE_MAX) { + state->error = WorkerError_memory; + state->errorOffset = frameIndex; + return; + } + + fp->destSize = (size_t)decompressedSize; } totalOutputSize += fp->destSize; @@ -878,7 +1055,7 @@ assert(framePointers[state->startOffset].destSize > 0); /* For now. */ - allocationSize = roundpow2(state->totalSourceSize); + allocationSize = roundpow2((size_t)state->totalSourceSize); if (framePointers[state->startOffset].destSize > allocationSize) { allocationSize = roundpow2(framePointers[state->startOffset].destSize); @@ -902,6 +1079,8 @@ destBuffer->segmentsSize = remainingItems; for (frameIndex = state->startOffset; frameIndex <= state->endOffset; frameIndex++) { + ZSTD_outBuffer outBuffer; + ZSTD_inBuffer inBuffer; const void* source = framePointers[frameIndex].sourceData; const size_t sourceSize = framePointers[frameIndex].sourceSize; void* dest; @@ -956,7 +1135,7 @@ /* Don't take any chances will non-NULL pointers. 
*/ memset(destBuffer, 0, sizeof(DestBuffer)); - allocationSize = roundpow2(state->totalSourceSize); + allocationSize = roundpow2((size_t)state->totalSourceSize); if (decompressedSize > allocationSize) { allocationSize = roundpow2(decompressedSize); @@ -985,31 +1164,31 @@ dest = (char*)destBuffer->dest + destOffset; - if (state->ddict) { - zresult = ZSTD_decompress_usingDDict(state->dctx, dest, decompressedSize, - source, sourceSize, state->ddict); - } - else { - zresult = ZSTD_decompressDCtx(state->dctx, dest, decompressedSize, - source, sourceSize); - } + outBuffer.dst = dest; + outBuffer.size = decompressedSize; + outBuffer.pos = 0; + inBuffer.src = source; + inBuffer.size = sourceSize; + inBuffer.pos = 0; + + zresult = ZSTD_decompress_generic(state->dctx, &outBuffer, &inBuffer); if (ZSTD_isError(zresult)) { state->error = WorkerError_zstd; state->zresult = zresult; state->errorOffset = frameIndex; return; } - else if (zresult != decompressedSize) { + else if (zresult || outBuffer.pos != decompressedSize) { state->error = WorkerError_sizeMismatch; - state->zresult = zresult; + state->zresult = outBuffer.pos; state->errorOffset = frameIndex; return; } destBuffer->segments[localOffset].offset = destOffset; - destBuffer->segments[localOffset].length = decompressedSize; - destOffset += zresult; + destBuffer->segments[localOffset].length = outBuffer.pos; + destOffset += outBuffer.pos; localOffset++; remainingItems--; } @@ -1027,9 +1206,7 @@ } ZstdBufferWithSegmentsCollection* decompress_from_framesources(ZstdDecompressor* decompressor, FrameSources* frames, - unsigned int threadCount) { - void* dictData = NULL; - size_t dictSize = 0; + Py_ssize_t threadCount) { Py_ssize_t i = 0; int errored = 0; Py_ssize_t segmentsCount; @@ -1039,7 +1216,7 @@ ZstdBufferWithSegmentsCollection* result = NULL; FramePointer* framePointers = frames->frames; unsigned long long workerBytes = 0; - int currentThread = 0; + Py_ssize_t currentThread = 0; Py_ssize_t workerStartOffset = 0; 
POOL_ctx* pool = NULL; WorkerState* workerStates = NULL; @@ -1049,24 +1226,14 @@ assert(threadCount >= 1); /* More threads than inputs makes no sense under any conditions. */ - threadCount = frames->framesSize < threadCount ? (unsigned int)frames->framesSize + threadCount = frames->framesSize < threadCount ? frames->framesSize : threadCount; /* TODO lower thread count if input size is too small and threads would just add overhead. */ if (decompressor->dict) { - dictData = decompressor->dict->dictData; - dictSize = decompressor->dict->dictSize; - } - - if (dictData && !decompressor->ddict) { - Py_BEGIN_ALLOW_THREADS - decompressor->ddict = ZSTD_createDDict_byReference(dictData, dictSize); - Py_END_ALLOW_THREADS - - if (!decompressor->ddict) { - PyErr_SetString(ZstdError, "could not create decompression dict"); + if (ensure_ddict(decompressor->dict)) { return NULL; } } @@ -1091,7 +1258,14 @@ bytesPerWorker = frames->compressedSize / threadCount; + if (bytesPerWorker > SIZE_MAX) { + PyErr_SetString(ZstdError, "too much data per worker for this platform"); + goto finally; + } + for (i = 0; i < threadCount; i++) { + size_t zresult; + workerStates[i].dctx = ZSTD_createDCtx(); if (NULL == workerStates[i].dctx) { PyErr_NoMemory(); @@ -1100,7 +1274,15 @@ ZSTD_copyDCtx(workerStates[i].dctx, decompressor->dctx); - workerStates[i].ddict = decompressor->ddict; + if (decompressor->dict) { + zresult = ZSTD_DCtx_refDDict(workerStates[i].dctx, decompressor->dict->ddict); + if (zresult) { + PyErr_Format(ZstdError, "unable to reference prepared dictionary: %s", + ZSTD_getErrorName(zresult)); + goto finally; + } + } + workerStates[i].framePointers = framePointers; workerStates[i].requireOutputSizes = 1; } @@ -1178,7 +1360,7 @@ break; case WorkerError_sizeMismatch: - PyErr_Format(ZstdError, "error decompressing item %zd: decompressed %zu bytes; expected %llu", + PyErr_Format(ZstdError, "error decompressing item %zd: decompressed %zu bytes; expected %zu", workerStates[i].errorOffset, 
workerStates[i].zresult, framePointers[workerStates[i].errorOffset].destSize); errored = 1; @@ -1388,9 +1570,21 @@ decompressedSize = frameSizesP[i]; } + if (sourceSize > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "item %zd is too large for this platform", i); + goto finally; + } + + if (decompressedSize > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "decompressed size of item %zd is too large for this platform", i); + goto finally; + } + framePointers[i].sourceData = sourceData; - framePointers[i].sourceSize = sourceSize; - framePointers[i].destSize = decompressedSize; + framePointers[i].sourceSize = (size_t)sourceSize; + framePointers[i].destSize = (size_t)decompressedSize; } } else if (PyObject_TypeCheck(frames, &ZstdBufferWithSegmentsCollectionType)) { @@ -1419,17 +1613,33 @@ buffer = collection->buffers[i]; for (segmentIndex = 0; segmentIndex < buffer->segmentCount; segmentIndex++) { + unsigned long long decompressedSize = frameSizesP ? frameSizesP[offset] : 0; + if (buffer->segments[segmentIndex].offset + buffer->segments[segmentIndex].length > buffer->dataSize) { PyErr_Format(PyExc_ValueError, "item %zd has offset outside memory area", offset); goto finally; } + if (buffer->segments[segmentIndex].length > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "item %zd in buffer %zd is too large for this platform", + segmentIndex, i); + goto finally; + } + + if (decompressedSize > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "decompressed size of item %zd in buffer %zd is too large for this platform", + segmentIndex, i); + goto finally; + } + totalInputSize += buffer->segments[segmentIndex].length; framePointers[offset].sourceData = (char*)buffer->data + buffer->segments[segmentIndex].offset; - framePointers[offset].sourceSize = buffer->segments[segmentIndex].length; - framePointers[offset].destSize = frameSizesP ? 
frameSizesP[offset] : 0; + framePointers[offset].sourceSize = (size_t)buffer->segments[segmentIndex].length; + framePointers[offset].destSize = (size_t)decompressedSize; offset++; } @@ -1450,11 +1660,6 @@ goto finally; } - /* - * It is not clear whether Py_buffer.buf is still valid after - * PyBuffer_Release. So, we hold a reference to all Py_buffer instances - * for the duration of the operation. - */ frameBuffers = PyMem_Malloc(frameCount * sizeof(Py_buffer)); if (NULL == frameBuffers) { PyErr_NoMemory(); @@ -1465,6 +1670,8 @@ /* Do a pass to assemble info about our input buffers and output sizes. */ for (i = 0; i < frameCount; i++) { + unsigned long long decompressedSize = frameSizesP ? frameSizesP[i] : 0; + if (0 != PyObject_GetBuffer(PyList_GET_ITEM(frames, i), &frameBuffers[i], PyBUF_CONTIG_RO)) { PyErr_Clear(); @@ -1472,11 +1679,17 @@ goto finally; } + if (decompressedSize > SIZE_MAX) { + PyErr_Format(PyExc_ValueError, + "decompressed size of item %zd is too large for this platform", i); + goto finally; + } + totalInputSize += frameBuffers[i].len; framePointers[i].sourceData = frameBuffers[i].buf; framePointers[i].sourceSize = frameBuffers[i].len; - framePointers[i].destSize = frameSizesP ? 
frameSizesP[i] : 0; + framePointers[i].destSize = (size_t)decompressedSize; } } else { @@ -1514,16 +1727,26 @@ Decompressor_copy_stream__doc__ }, { "decompress", (PyCFunction)Decompressor_decompress, METH_VARARGS | METH_KEYWORDS, Decompressor_decompress__doc__ }, - { "decompressobj", (PyCFunction)Decompressor_decompressobj, METH_NOARGS, + { "decompressobj", (PyCFunction)Decompressor_decompressobj, METH_VARARGS | METH_KEYWORDS, Decompressor_decompressobj__doc__ }, - { "read_from", (PyCFunction)Decompressor_read_from, METH_VARARGS | METH_KEYWORDS, - Decompressor_read_from__doc__ }, - { "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS, - Decompressor_write_to__doc__ }, + { "read_to_iter", (PyCFunction)Decompressor_read_to_iter, METH_VARARGS | METH_KEYWORDS, + Decompressor_read_to_iter__doc__ }, + /* TODO Remove deprecated API */ + { "read_from", (PyCFunction)Decompressor_read_to_iter, METH_VARARGS | METH_KEYWORDS, + Decompressor_read_to_iter__doc__ }, + { "stream_reader", (PyCFunction)Decompressor_stream_reader, + METH_VARARGS | METH_KEYWORDS, Decompressor_stream_reader__doc__ }, + { "stream_writer", (PyCFunction)Decompressor_stream_writer, METH_VARARGS | METH_KEYWORDS, + Decompressor_stream_writer__doc__ }, + /* TODO remove deprecated API */ + { "write_to", (PyCFunction)Decompressor_stream_writer, METH_VARARGS | METH_KEYWORDS, + Decompressor_stream_writer__doc__ }, { "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain, METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ }, { "multi_decompress_to_buffer", (PyCFunction)Decompressor_multi_decompress_to_buffer, METH_VARARGS | METH_KEYWORDS, Decompressor_multi_decompress_to_buffer__doc__ }, + { "memory_size", (PyCFunction)Decompressor_memory_size, METH_NOARGS, + Decompressor_memory_size__doc__ }, { NULL, NULL } }; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/decompressoriterator.c --- 
a/contrib/python-zstandard/c-ext/decompressoriterator.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/decompressoriterator.c Wed Apr 18 15:32:08 2018 -0400 @@ -20,10 +20,9 @@ Py_XDECREF(self->decompressor); Py_XDECREF(self->reader); - if (self->buffer) { - PyBuffer_Release(self->buffer); - PyMem_FREE(self->buffer); - self->buffer = NULL; + if (self->buffer.buf) { + PyBuffer_Release(&self->buffer); + memset(&self->buffer, 0, sizeof(self->buffer)); } if (self->input.src) { @@ -45,8 +44,6 @@ DecompressorIteratorResult result; size_t oldInputPos = self->input.pos; - assert(self->decompressor->dstream); - result.chunk = NULL; chunk = PyBytes_FromStringAndSize(NULL, self->outSize); @@ -60,7 +57,7 @@ self->output.pos = 0; Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_decompressStream(self->decompressor->dstream, &self->output, &self->input); + zresult = ZSTD_decompress_generic(self->decompressor->dctx, &self->output, &self->input); Py_END_ALLOW_THREADS /* We're done with the pointer. Nullify to prevent anyone from getting a @@ -86,7 +83,8 @@ /* If it produced output data, return it. 
*/ if (self->output.pos) { if (self->output.pos < self->outSize) { - if (_PyBytes_Resize(&chunk, self->output.pos)) { + if (safe_pybytes_resize(&chunk, self->output.pos)) { + Py_XDECREF(chunk); result.errored = 1; return result; } @@ -137,15 +135,15 @@ PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize); } else { - assert(self->buffer && self->buffer->buf); + assert(self->buffer.buf); /* Only support contiguous C arrays for now */ - assert(self->buffer->strides == NULL && self->buffer->suboffsets == NULL); - assert(self->buffer->itemsize == 1); + assert(self->buffer.strides == NULL && self->buffer.suboffsets == NULL); + assert(self->buffer.itemsize == 1); /* TODO avoid memcpy() below */ - readBuffer = (char *)self->buffer->buf + self->bufferOffset; - bufferRemaining = self->buffer->len - self->bufferOffset; + readBuffer = (char *)self->buffer.buf + self->bufferOffset; + bufferRemaining = self->buffer.len - self->bufferOffset; readSize = min(bufferRemaining, (Py_ssize_t)self->inSize); self->bufferOffset += readSize; } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/frameparams.c --- a/contrib/python-zstandard/c-ext/frameparams.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/frameparams.c Wed Apr 18 15:32:08 2018 -0400 @@ -13,50 +13,56 @@ PyDoc_STRVAR(FrameParameters__doc__, "FrameParameters: information about a zstd frame"); -FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; - ZSTD_frameParams params; +FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + NULL + }; + + Py_buffer source; + ZSTD_frameHeader header; FrameParametersObject* result = NULL; size_t zresult; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:get_frame_parameters", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:get_frame_parameters", #else - if (!PyArg_ParseTuple(args, 
"s#:get_frame_parameters", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:get_frame_parameters", #endif - &source, &sourceSize)) { + kwlist, &source)) { return NULL; } - /* Needed for Python 2 to reject unicode */ - if (!PyBytes_Check(PyTuple_GET_ITEM(args, 0))) { - PyErr_SetString(PyExc_TypeError, "argument must be bytes"); - return NULL; + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; } - zresult = ZSTD_getFrameParams(¶ms, (void*)source, sourceSize); + zresult = ZSTD_getFrameHeader(&header, source.buf, source.len); if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "cannot get frame parameters: %s", ZSTD_getErrorName(zresult)); - return NULL; + goto finally; } if (zresult) { PyErr_Format(ZstdError, "not enough data for frame parameters; need %zu bytes", zresult); - return NULL; + goto finally; } result = PyObject_New(FrameParametersObject, &FrameParametersType); if (!result) { - return NULL; + goto finally; } - result->frameContentSize = params.frameContentSize; - result->windowSize = params.windowSize; - result->dictID = params.dictID; - result->checksumFlag = params.checksumFlag ? 1 : 0; + result->frameContentSize = header.frameContentSize; + result->windowSize = header.windowSize; + result->dictID = header.dictID; + result->checksumFlag = header.checksumFlag ? 
1 : 0; +finally: + PyBuffer_Release(&source); return result; } @@ -68,7 +74,7 @@ { "content_size", T_ULONGLONG, offsetof(FrameParametersObject, frameContentSize), READONLY, "frame content size" }, - { "window_size", T_UINT, + { "window_size", T_ULONGLONG, offsetof(FrameParametersObject, windowSize), READONLY, "window size" }, { "dict_id", T_UINT, diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/c-ext/python-zstandard.h --- a/contrib/python-zstandard/c-ext/python-zstandard.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/c-ext/python-zstandard.h Wed Apr 18 15:32:08 2018 -0400 @@ -12,12 +12,10 @@ #define ZSTD_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY -#include "mem.h" -#include "zstd.h" -#include "zdict.h" -#include "zstdmt_compress.h" +#include +#include -#define PYTHON_ZSTANDARD_VERSION "0.8.1" +#define PYTHON_ZSTANDARD_VERSION "0.9.0" typedef enum { compressorobj_flush_finish, @@ -25,22 +23,38 @@ } CompressorObj_Flush; /* - Represents a CompressionParameters type. + Represents a ZstdCompressionParameters type. - This type is basically a wrapper around ZSTD_compressionParameters. + This type holds all the low-level compression parameters that can be set. 
*/ typedef struct { PyObject_HEAD + ZSTD_CCtx_params* params; + unsigned format; + int compressionLevel; unsigned windowLog; + unsigned hashLog; unsigned chainLog; - unsigned hashLog; unsigned searchLog; - unsigned searchLength; + unsigned minMatch; unsigned targetLength; - ZSTD_strategy strategy; -} CompressionParametersObject; + unsigned compressionStrategy; + unsigned contentSizeFlag; + unsigned checksumFlag; + unsigned dictIDFlag; + unsigned threads; + unsigned jobSize; + unsigned overlapSizeLog; + unsigned compressLiterals; + unsigned forceMaxWindow; + unsigned enableLongDistanceMatching; + unsigned ldmHashLog; + unsigned ldmMinMatch; + unsigned ldmBucketSizeLog; + unsigned ldmHashEveryLog; +} ZstdCompressionParametersObject; -extern PyTypeObject CompressionParametersType; +extern PyTypeObject ZstdCompressionParametersType; /* Represents a FrameParameters type. @@ -50,7 +64,7 @@ typedef struct { PyObject_HEAD unsigned long long frameContentSize; - unsigned windowSize; + unsigned long long windowSize; unsigned dictID; char checksumFlag; } FrameParametersObject; @@ -69,10 +83,14 @@ void* dictData; /* Size of dictionary data. */ size_t dictSize; + ZSTD_dictContentType_e dictType; /* k parameter for cover dictionaries. Only populated by train_cover_dict(). */ unsigned k; /* d parameter for cover dictionaries. Only populated by train_cover_dict(). */ unsigned d; + /* Digested dictionary, suitable for reuse. */ + ZSTD_CDict* cdict; + ZSTD_DDict* ddict; } ZstdCompressionDict; extern PyTypeObject ZstdCompressionDictType; @@ -83,29 +101,15 @@ typedef struct { PyObject_HEAD - /* Configured compression level. Should be always set. */ - int compressionLevel; /* Number of threads to use for operations. */ unsigned int threads; /* Pointer to compression dictionary to use. NULL if not using dictionary compression. */ ZstdCompressionDict* dict; - /* Compression context to use. Populated during object construction. NULL - if using multi-threaded compression. 
*/ + /* Compression context to use. Populated during object construction. */ ZSTD_CCtx* cctx; - /* Multi-threaded compression context to use. Populated during object - construction. NULL if not using multi-threaded compression. */ - ZSTDMT_CCtx* mtcctx; - /* Digest compression dictionary. NULL initially. Populated on first use. */ - ZSTD_CDict* cdict; - /* Low-level compression parameter control. NULL unless passed to - constructor. Takes precedence over `compressionLevel` if defined. */ - CompressionParametersObject* cparams; - /* Controls zstd frame options. */ - ZSTD_frameParameters fparams; - /* Holds state for streaming compression. Shared across all invocation. - Populated on first use. */ - ZSTD_CStream* cstream; + /* Compression parameters in use. */ + ZSTD_CCtx_params* params; } ZstdCompressor; extern PyTypeObject ZstdCompressorType; @@ -125,9 +129,10 @@ ZstdCompressor* compressor; PyObject* writer; - Py_ssize_t sourceSize; + unsigned long long sourceSize; size_t outSize; int entered; + unsigned long long bytesCompressed; } ZstdCompressionWriter; extern PyTypeObject ZstdCompressionWriterType; @@ -137,9 +142,8 @@ ZstdCompressor* compressor; PyObject* reader; - Py_buffer* buffer; + Py_buffer buffer; Py_ssize_t bufferOffset; - Py_ssize_t sourceSize; size_t inSize; size_t outSize; @@ -155,11 +159,32 @@ typedef struct { PyObject_HEAD + ZstdCompressor* compressor; + PyObject* reader; + Py_buffer buffer; + unsigned long long sourceSize; + size_t readSize; + + int entered; + int closed; + unsigned long long bytesCompressed; + + ZSTD_inBuffer input; + ZSTD_outBuffer output; + int finishedInput; + int finishedOutput; + PyObject* readResult; +} ZstdCompressionReader; + +extern PyTypeObject ZstdCompressionReaderType; + +typedef struct { + PyObject_HEAD + ZSTD_DCtx* dctx; - ZstdCompressionDict* dict; - ZSTD_DDict* ddict; - ZSTD_DStream* dstream; + size_t maxWindowSize; + ZSTD_format_e format; } ZstdDecompressor; extern PyTypeObject ZstdDecompressorType; @@ -168,6 
+193,7 @@ PyObject_HEAD ZstdDecompressor* decompressor; + size_t outSize; int finished; } ZstdDecompressionObj; @@ -176,6 +202,40 @@ typedef struct { PyObject_HEAD + /* Parent decompressor to which this object is associated. */ + ZstdDecompressor* decompressor; + /* Object to read() from (if reading from a stream). */ + PyObject* reader; + /* Size for read() operations on reader. */ + size_t readSize; + /* Buffer to read from (if reading from a buffer). */ + Py_buffer buffer; + + /* Whether the context manager is active. */ + int entered; + /* Whether we've closed the stream. */ + int closed; + + /* Number of bytes decompressed and returned to user. */ + unsigned long long bytesDecompressed; + + /* Tracks data going into decompressor. */ + ZSTD_inBuffer input; + + /* Holds output from read() operation on reader. */ + PyObject* readResult; + + /* Whether all input has been sent to the decompressor. */ + int finishedInput; + /* Whether all output has been flushed from the decompressor. */ + int finishedOutput; +} ZstdDecompressionReader; + +extern PyTypeObject ZstdDecompressionReaderType; + +typedef struct { + PyObject_HEAD + ZstdDecompressor* decompressor; PyObject* writer; size_t outSize; @@ -189,7 +249,7 @@ ZstdDecompressor* decompressor; PyObject* reader; - Py_buffer* buffer; + Py_buffer buffer; Py_ssize_t bufferOffset; size_t inSize; size_t outSize; @@ -209,6 +269,9 @@ } DecompressorIteratorResult; typedef struct { + /* The public API is that these are 64-bit unsigned integers. So these can't + * be size_t, even though values larger than SIZE_MAX or PY_SSIZE_T_MAX may + * be nonsensical for this platform. 
*/ unsigned long long offset; unsigned long long length; } BufferSegment; @@ -270,16 +333,14 @@ extern PyTypeObject ZstdBufferWithSegmentsCollectionType; -void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams); -CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args); -FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args); -PyObject* estimate_compression_context_size(PyObject* self, PyObject* args); -int init_cstream(ZstdCompressor* compressor, unsigned long long sourceSize); -int init_mtcstream(ZstdCompressor* compressor, Py_ssize_t sourceSize); -int init_dstream(ZstdDecompressor* decompressor); +int set_parameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value); +int set_parameters(ZSTD_CCtx_params* params, ZstdCompressionParametersObject* obj); +FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args, PyObject* kwargs); +int ensure_ddict(ZstdCompressionDict* dict); +int ensure_dctx(ZstdDecompressor* decompressor, int loadDict); ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs); -ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs); ZstdBufferWithSegments* BufferWithSegments_FromMemory(void* data, unsigned long long dataSize, BufferSegment* segments, Py_ssize_t segmentsSize); Py_ssize_t BufferWithSegmentsCollection_length(ZstdBufferWithSegmentsCollection*); int cpu_count(void); size_t roundpow2(size_t); +int safe_pybytes_resize(PyObject** obj, Py_ssize_t size); diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/make_cffi.py --- a/contrib/python-zstandard/make_cffi.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/make_cffi.py Wed Apr 18 15:32:08 2018 -0400 @@ -27,6 +27,11 @@ 'compress/fse_compress.c', 'compress/huf_compress.c', 'compress/zstd_compress.c', + 'compress/zstd_double_fast.c', + 'compress/zstd_fast.c', + 
'compress/zstd_lazy.c', + 'compress/zstd_ldm.c', + 'compress/zstd_opt.c', 'compress/zstdmt_compress.c', 'decompress/huf_decompress.c', 'decompress/zstd_decompress.c', @@ -38,7 +43,6 @@ # Headers whose preprocessed output will be fed into cdef(). HEADERS = [os.path.join(HERE, 'zstd', *p) for p in ( ('zstd.h',), - ('compress', 'zstdmt_compress.h'), ('dictBuilder', 'zdict.h'), )] @@ -80,7 +84,9 @@ def preprocess(path): with open(path, 'rb') as fh: lines = [] - for l in fh: + it = iter(fh) + + for l in it: # zstd.h includes , which is also included by cffi's # boilerplate. This can lead to duplicate declarations. So we strip # this include from the preprocessor invocation. @@ -137,18 +143,21 @@ ffi = cffi.FFI() +# zstd.h uses a possible undefined MIN(). Define it until +# https://github.com/facebook/zstd/issues/976 is fixed. # *_DISABLE_DEPRECATE_WARNINGS prevents the compiler from emitting a warning # when cffi uses the function. Since we statically link against zstd, even # if we use the deprecated functions it shouldn't be a huge problem. ffi.set_source('_zstd_cffi', ''' -#include "mem.h" +#define MIN(a,b) ((a)<(b) ? (a) : (b)) #define ZSTD_STATIC_LINKING_ONLY -#include "zstd.h" +#include #define ZDICT_STATIC_LINKING_ONLY #define ZDICT_DISABLE_DEPRECATE_WARNINGS -#include "zdict.h" -#include "zstdmt_compress.h" -''', sources=SOURCES, include_dirs=INCLUDE_DIRS) +#include +''', sources=SOURCES, + include_dirs=INCLUDE_DIRS, + extra_compile_args=['-DZSTD_MULTITHREAD']) DEFINE = re.compile(b'^\\#define ([a-zA-Z0-9_]+) ') diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/setup.py --- a/contrib/python-zstandard/setup.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/setup.py Wed Apr 18 15:32:08 2018 -0400 @@ -5,6 +5,7 @@ # This software may be modified and distributed under the terms # of the BSD license. See the LICENSE file for details. 
+import os import sys from setuptools import setup @@ -16,14 +17,32 @@ import setup_zstd SUPPORT_LEGACY = False +SYSTEM_ZSTD = False +WARNINGS_AS_ERRORS = False -if "--legacy" in sys.argv: +if os.environ.get('ZSTD_WARNINGS_AS_ERRORS', ''): + WARNINGS_AS_ERRORS = True + +if '--legacy' in sys.argv: SUPPORT_LEGACY = True - sys.argv.remove("--legacy") + sys.argv.remove('--legacy') + +if '--system-zstd' in sys.argv: + SYSTEM_ZSTD = True + sys.argv.remove('--system-zstd') + +if '--warnings-as-errors' in sys.argv: + WARNINGS_AS_ERRORS = True + sys.argv.remote('--warning-as-errors') # Code for obtaining the Extension instance is in its own module to # facilitate reuse in other projects. -extensions = [setup_zstd.get_c_extension(SUPPORT_LEGACY, 'zstd')] +extensions = [ + setup_zstd.get_c_extension(name='zstd', + support_legacy=SUPPORT_LEGACY, + system_zstd=SYSTEM_ZSTD, + warnings_as_errors=WARNINGS_AS_ERRORS), +] install_requires = [] @@ -31,8 +50,11 @@ import make_cffi extensions.append(make_cffi.ffi.distutils_extension()) - # Need change in 1.8 for ffi.from_buffer() behavior. - install_requires.append('cffi>=1.8') + # Need change in 1.10 for ffi.from_buffer() to handle all buffer types + # (like memoryview). + # Need feature in 1.11 for ffi.gc() to declare size of objects so we avoid + # garbage collection pitfalls. 
+ install_requires.append('cffi>=1.11') version = None @@ -62,14 +84,13 @@ 'Intended Audience :: Developers', 'License :: OSI Approved :: BSD License', 'Programming Language :: C', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ], keywords='zstandard zstd compression', + packages=['zstandard'], ext_modules=extensions, test_suite='tests', install_requires=install_requires, diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/setup_zstd.py --- a/contrib/python-zstandard/setup_zstd.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/setup_zstd.py Wed Apr 18 15:32:08 2018 -0400 @@ -4,7 +4,10 @@ # This software may be modified and distributed under the terms # of the BSD license. See the LICENSE file for details. +import distutils.ccompiler import os +import sys + from distutils.extension import Extension @@ -19,6 +22,11 @@ 'compress/fse_compress.c', 'compress/huf_compress.c', 'compress/zstd_compress.c', + 'compress/zstd_double_fast.c', + 'compress/zstd_fast.c', + 'compress/zstd_lazy.c', + 'compress/zstd_ldm.c', + 'compress/zstd_opt.c', 'compress/zstdmt_compress.c', 'decompress/huf_decompress.c', 'decompress/zstd_decompress.c', @@ -41,7 +49,6 @@ )] zstd_includes = [ - 'c-ext', 'zstd', 'zstd/common', 'zstd/compress', @@ -54,7 +61,14 @@ 'zstd/legacy', ] +ext_includes = [ + 'c-ext', + 'zstd/common', +] + ext_sources = [ + 'zstd/common/pool.c', + 'zstd/common/threading.c', 'zstd.c', 'c-ext/bufferutil.c', 'c-ext/compressiondict.c', @@ -62,11 +76,13 @@ 'c-ext/compressor.c', 'c-ext/compressoriterator.c', 'c-ext/compressionparams.c', + 'c-ext/compressionreader.c', 'c-ext/compressionwriter.c', 'c-ext/constants.c', 'c-ext/decompressobj.c', 'c-ext/decompressor.c', 'c-ext/decompressoriterator.c', + 'c-ext/decompressionreader.c', 'c-ext/decompressionwriter.c', 
'c-ext/frameparams.c', ] @@ -76,27 +92,67 @@ ] -def get_c_extension(support_legacy=False, name='zstd'): +def get_c_extension(support_legacy=False, system_zstd=False, name='zstd', + warnings_as_errors=False): """Obtain a distutils.extension.Extension for the C extension.""" root = os.path.abspath(os.path.dirname(__file__)) - sources = [os.path.join(root, p) for p in zstd_sources + ext_sources] - if support_legacy: - sources.extend([os.path.join(root, p) for p in zstd_sources_legacy]) + sources = set([os.path.join(root, p) for p in ext_sources]) + if not system_zstd: + sources.update([os.path.join(root, p) for p in zstd_sources]) + if support_legacy: + sources.update([os.path.join(root, p) for p in zstd_sources_legacy]) + sources = list(sources) - include_dirs = [os.path.join(root, d) for d in zstd_includes] - if support_legacy: - include_dirs.extend([os.path.join(root, d) for d in zstd_includes_legacy]) + include_dirs = set([os.path.join(root, d) for d in ext_includes]) + if not system_zstd: + include_dirs.update([os.path.join(root, d) for d in zstd_includes]) + if support_legacy: + include_dirs.update([os.path.join(root, d) for d in zstd_includes_legacy]) + include_dirs = list(include_dirs) depends = [os.path.join(root, p) for p in zstd_depends] + compiler = distutils.ccompiler.new_compiler() + + # Needed for MSVC. 
+ if hasattr(compiler, 'initialize'): + compiler.initialize() + + if compiler.compiler_type == 'unix': + compiler_type = 'unix' + elif compiler.compiler_type == 'msvc': + compiler_type = 'msvc' + else: + raise Exception('unhandled compiler type: %s' % + compiler.compiler_type) + extra_args = ['-DZSTD_MULTITHREAD'] - if support_legacy: + if not system_zstd: + extra_args.append('-DZSTDLIB_VISIBILITY=') + extra_args.append('-DZDICTLIB_VISIBILITY=') + extra_args.append('-DZSTDERRORLIB_VISIBILITY=') + + if compiler_type == 'unix': + extra_args.append('-fvisibility=hidden') + + if not system_zstd and support_legacy: extra_args.append('-DZSTD_LEGACY_SUPPORT=1') + if warnings_as_errors: + if compiler_type == 'unix': + extra_args.append('-Werror') + elif compiler_type == 'msvc': + extra_args.append('/WX') + else: + assert False + + libraries = ['zstd'] if system_zstd else [] + # TODO compile with optimizations. return Extension(name, sources, include_dirs=include_dirs, depends=depends, - extra_compile_args=extra_args) + extra_compile_args=extra_args, + libraries=libraries) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/common.py --- a/contrib/python-zstandard/tests/common.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/common.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,16 +1,48 @@ +import imp import inspect import io import os import types +try: + import hypothesis +except ImportError: + hypothesis = None + def make_cffi(cls): """Decorator to add CFFI versions of each test method.""" + # The module containing this class definition should + # `import zstandard as zstd`. Otherwise things may blow up. 
+ mod = inspect.getmodule(cls) + if not hasattr(mod, 'zstd'): + raise Exception('test module does not contain "zstd" symbol') + + if not hasattr(mod.zstd, 'backend'): + raise Exception('zstd symbol does not have "backend" attribute; did ' + 'you `import zstandard as zstd`?') + + # If `import zstandard` already chose the cffi backend, there is nothing + # for us to do: we only add the cffi variation if the default backend + # is the C extension. + if mod.zstd.backend == 'cffi': + return cls + + old_env = dict(os.environ) + os.environ['PYTHON_ZSTANDARD_IMPORT_POLICY'] = 'cffi' try: - import zstd_cffi - except ImportError: - return cls + try: + mod_info = imp.find_module('zstandard') + mod = imp.load_module('zstandard_cffi', *mod_info) + except ImportError: + return cls + finally: + os.environ.clear() + os.environ.update(old_env) + + if mod.backend != 'cffi': + raise Exception('got the zstandard %s backend instead of cffi' % mod.backend) # If CFFI version is available, dynamically construct test methods # that use it. @@ -29,13 +61,13 @@ # the function object and install it in a new attribute. 
if isinstance(fn, types.FunctionType): globs = dict(fn.__globals__) - globs['zstd'] = zstd_cffi + globs['zstd'] = mod new_fn = types.FunctionType(fn.__code__, globs, name, fn.__defaults__, fn.__closure__) new_method = new_fn else: globs = dict(fn.__func__.func_globals) - globs['zstd'] = zstd_cffi + globs['zstd'] = mod new_fn = types.FunctionType(fn.__func__.func_code, globs, name, fn.__func__.func_defaults, fn.__func__.func_closure) @@ -86,3 +118,34 @@ pass return _source_files + + +def generate_samples(): + inputs = [ + b'foo', + b'bar', + b'abcdef', + b'sometext', + b'baz', + ] + + samples = [] + + for i in range(128): + samples.append(inputs[i % 5]) + samples.append(inputs[i % 5] * (i + 3)) + samples.append(inputs[-(i % 5)] * (i + 2)) + + return samples + + +if hypothesis: + default_settings = hypothesis.settings() + hypothesis.settings.register_profile('default', default_settings) + + ci_settings = hypothesis.settings(max_examples=2500, + max_iterations=2500) + hypothesis.settings.register_profile('ci', ci_settings) + + hypothesis.settings.load_profile( + os.environ.get('HYPOTHESIS_PROFILE', 'default')) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_buffer_util.py --- a/contrib/python-zstandard/tests/test_buffer_util.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_buffer_util.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,11 +1,7 @@ import struct +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -import zstd +import zstandard as zstd ss = struct.Struct('=QQ') diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_compressor.py --- a/contrib/python-zstandard/tests/test_compressor.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_compressor.py Wed Apr 18 15:32:08 2018 -0400 @@ -2,13 +2,10 @@ import io import struct import sys +import tarfile +import unittest -try: - import unittest2 as unittest -except ImportError: - import 
unittest - -import zstd +import zstandard as zstd from .common import ( make_cffi, @@ -23,7 +20,8 @@ def multithreaded_chunk_size(level, source_size=0): - params = zstd.get_compression_parameters(level, source_size) + params = zstd.ZstdCompressionParameters.from_level(level, + source_size=source_size) return 1 << (params.window_log + 2) @@ -32,67 +30,82 @@ class TestCompressor(unittest.TestCase): def test_level_bounds(self): with self.assertRaises(ValueError): - zstd.ZstdCompressor(level=0) + zstd.ZstdCompressor(level=23) - with self.assertRaises(ValueError): - zstd.ZstdCompressor(level=23) + def test_memory_size(self): + cctx = zstd.ZstdCompressor(level=1) + self.assertGreater(cctx.memory_size(), 100) @make_cffi class TestCompressor_compress(unittest.TestCase): - def test_multithreaded_unsupported(self): - samples = [] - for i in range(128): - samples.append(b'foo' * 64) - samples.append(b'bar' * 64) - - d = zstd.train_dictionary(8192, samples) - - cctx = zstd.ZstdCompressor(dict_data=d, threads=2) - - with self.assertRaisesRegexp(zstd.ZstdError, 'compress\(\) cannot be used with both dictionaries and multi-threaded compression'): - cctx.compress(b'foo') - - params = zstd.get_compression_parameters(3) - cctx = zstd.ZstdCompressor(compression_params=params, threads=2) - with self.assertRaisesRegexp(zstd.ZstdError, 'compress\(\) cannot be used with both compression parameters and multi-threaded compression'): - cctx.compress(b'foo') - def test_compress_empty(self): - cctx = zstd.ZstdCompressor(level=1) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) result = cctx.compress(b'') self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') params = zstd.get_frame_parameters(result) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 524288) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum, 0) - # TODO should be temporary until 
https://github.com/facebook/zstd/issues/506 - # is fixed. - cctx = zstd.ZstdCompressor(write_content_size=True) - with self.assertRaises(ValueError): - cctx.compress(b'') + cctx = zstd.ZstdCompressor() + result = cctx.compress(b'') + self.assertEqual(result, b'\x28\xb5\x2f\xfd\x20\x00\x01\x00\x00') + params = zstd.get_frame_parameters(result) + self.assertEqual(params.content_size, 0) + + def test_input_types(self): + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) + expected = b'\x28\xb5\x2f\xfd\x00\x00\x19\x00\x00\x66\x6f\x6f' - cctx.compress(b'', allow_empty=True) + mutable_array = bytearray(3) + mutable_array[:] = b'foo' + + sources = [ + memoryview(b'foo'), + bytearray(b'foo'), + mutable_array, + ] + + for source in sources: + self.assertEqual(cctx.compress(source), expected) def test_compress_large(self): chunks = [] for i in range(255): chunks.append(struct.Struct('>B').pack(i) * 16384) - cctx = zstd.ZstdCompressor(level=3) + cctx = zstd.ZstdCompressor(level=3, write_content_size=False) result = cctx.compress(b''.join(chunks)) self.assertEqual(len(result), 999) self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd') - # This matches the test for read_from() below. - cctx = zstd.ZstdCompressor(level=1) + # This matches the test for read_to_iter() below. 
+ cctx = zstd.ZstdCompressor(level=1, write_content_size=False) result = cctx.compress(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE + b'o') self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x40\x54\x00\x00' b'\x10\x66\x66\x01\x00\xfb\xff\x39\xc0' b'\x02\x09\x00\x00\x6f') + def test_negative_level(self): + cctx = zstd.ZstdCompressor(level=-4) + result = cctx.compress(b'foo' * 256) + + def test_no_magic(self): + params = zstd.ZstdCompressionParameters.from_level( + 1, format=zstd.FORMAT_ZSTD1) + cctx = zstd.ZstdCompressor(compression_params=params) + magic = cctx.compress(b'foobar') + + params = zstd.ZstdCompressionParameters.from_level( + 1, format=zstd.FORMAT_ZSTD1_MAGICLESS) + cctx = zstd.ZstdCompressor(compression_params=params) + no_magic = cctx.compress(b'foobar') + + self.assertEqual(magic[0:4], b'\x28\xb5\x2f\xfd') + self.assertEqual(magic[4:], no_magic) + def test_write_checksum(self): cctx = zstd.ZstdCompressor(level=1) no_checksum = cctx.compress(b'foobar') @@ -109,15 +122,15 @@ def test_write_content_size(self): cctx = zstd.ZstdCompressor(level=1) + with_size = cctx.compress(b'foobar' * 256) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) no_size = cctx.compress(b'foobar' * 256) - cctx = zstd.ZstdCompressor(level=1, write_content_size=True) - with_size = cctx.compress(b'foobar' * 256) self.assertEqual(len(with_size), len(no_size) + 1) no_params = zstd.get_frame_parameters(no_size) with_params = zstd.get_frame_parameters(with_size) - self.assertEqual(no_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(with_params.content_size, 1536) def test_no_dict_id(self): @@ -140,7 +153,7 @@ no_params = zstd.get_frame_parameters(no_dict_id) with_params = zstd.get_frame_parameters(with_dict_id) self.assertEqual(no_params.dict_id, 0) - self.assertEqual(with_params.dict_id, 1584102229) + self.assertEqual(with_params.dict_id, 1387616518) def test_compress_dict_multiple(self): samples = [] @@ 
-156,6 +169,21 @@ for i in range(32): cctx.compress(b'foo bar foobar foo bar foobar') + def test_dict_precompute(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(8192, samples) + d.precompute_compress(level=1) + + cctx = zstd.ZstdCompressor(level=1, dict_data=d) + + for i in range(32): + cctx.compress(b'foo bar foobar foo bar foobar') + def test_multithreaded(self): chunk_size = multithreaded_chunk_size(1) source = b''.join([b'x' * chunk_size, b'y' * chunk_size]) @@ -171,16 +199,65 @@ dctx = zstd.ZstdDecompressor() self.assertEqual(dctx.decompress(compressed), source) + def test_multithreaded_dict(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(1024, samples) + + cctx = zstd.ZstdCompressor(dict_data=d, threads=2) + + result = cctx.compress(b'foo') + params = zstd.get_frame_parameters(result); + self.assertEqual(params.content_size, 3); + self.assertEqual(params.dict_id, d.dict_id()) + + self.assertEqual(result, + b'\x28\xb5\x2f\xfd\x23\x06\x59\xb5\x52\x03\x19\x00\x00' + b'\x66\x6f\x6f') + + def test_multithreaded_compression_params(self): + params = zstd.ZstdCompressionParameters.from_level(0, threads=2) + cctx = zstd.ZstdCompressor(compression_params=params) + + result = cctx.compress(b'foo') + params = zstd.get_frame_parameters(result); + self.assertEqual(params.content_size, 3); + + self.assertEqual(result, + b'\x28\xb5\x2f\xfd\x20\x03\x19\x00\x00\x66\x6f\x6f') + @make_cffi class TestCompressor_compressobj(unittest.TestCase): def test_compressobj_empty(self): - cctx = zstd.ZstdCompressor(level=1) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) cobj = cctx.compressobj() self.assertEqual(cobj.compress(b''), b'') self.assertEqual(cobj.flush(), b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + def test_input_types(self): + 
expected = b'\x28\xb5\x2f\xfd\x00\x48\x19\x00\x00\x66\x6f\x6f' + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) + + mutable_array = bytearray(3) + mutable_array[:] = b'foo' + + sources = [ + memoryview(b'foo'), + bytearray(b'foo'), + mutable_array, + ] + + for source in sources: + cobj = cctx.compressobj() + self.assertEqual(cobj.compress(source), b'') + self.assertEqual(cobj.flush(), expected) + def test_compressobj_large(self): chunks = [] for i in range(255): @@ -194,7 +271,7 @@ self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd') params = zstd.get_frame_parameters(result) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 1048576) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) @@ -209,8 +286,8 @@ no_params = zstd.get_frame_parameters(no_checksum) with_params = zstd.get_frame_parameters(with_checksum) - self.assertEqual(no_params.content_size, 0) - self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) + self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, 0) self.assertFalse(no_params.has_checksum) @@ -221,14 +298,14 @@ def test_write_content_size(self): cctx = zstd.ZstdCompressor(level=1) cobj = cctx.compressobj(size=len(b'foobar' * 256)) + with_size = cobj.compress(b'foobar' * 256) + cobj.flush() + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) + cobj = cctx.compressobj(size=len(b'foobar' * 256)) no_size = cobj.compress(b'foobar' * 256) + cobj.flush() - cctx = zstd.ZstdCompressor(level=1, write_content_size=True) - cobj = cctx.compressobj(size=len(b'foobar' * 256)) - with_size = cobj.compress(b'foobar' * 256) + cobj.flush() no_params = zstd.get_frame_parameters(no_size) with_params = zstd.get_frame_parameters(with_size) - self.assertEqual(no_params.content_size, 
0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(with_params.content_size, 1536) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, 0) @@ -300,6 +377,34 @@ self.assertEqual(len(compressed), 295) + def test_frame_progression(self): + cctx = zstd.ZstdCompressor() + + self.assertEqual(cctx.frame_progression(), (0, 0, 0)) + + cobj = cctx.compressobj() + + cobj.compress(b'foobar') + self.assertEqual(cctx.frame_progression(), (6, 0, 0)) + + cobj.flush() + self.assertEqual(cctx.frame_progression(), (6, 6, 15)) + + def test_bad_size(self): + cctx = zstd.ZstdCompressor() + + cobj = cctx.compressobj(size=2) + with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'): + cobj.compress(b'foo') + + # Try another operation on this instance. + with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'): + cobj.compress(b'aa') + + # Try another operation on the compressor. + cctx.compressobj(size=4) + cctx.compress(b'foobar') + @make_cffi class TestCompressor_copy_stream(unittest.TestCase): @@ -323,7 +428,7 @@ source = io.BytesIO() dest = io.BytesIO() - cctx = zstd.ZstdCompressor(level=1) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) r, w = cctx.copy_stream(source, dest) self.assertEqual(int(r), 0) self.assertEqual(w, 9) @@ -345,7 +450,7 @@ self.assertEqual(w, 999) params = zstd.get_frame_parameters(dest.getvalue()) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 1048576) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) @@ -367,8 +472,8 @@ no_params = zstd.get_frame_parameters(no_checksum.getvalue()) with_params = zstd.get_frame_parameters(with_checksum.getvalue()) - self.assertEqual(no_params.content_size, 0) - self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) + 
self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, 0) self.assertFalse(no_params.has_checksum) @@ -378,12 +483,12 @@ source = io.BytesIO(b'foobar' * 256) no_size = io.BytesIO() - cctx = zstd.ZstdCompressor(level=1) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) cctx.copy_stream(source, no_size) source.seek(0) with_size = io.BytesIO() - cctx = zstd.ZstdCompressor(level=1, write_content_size=True) + cctx = zstd.ZstdCompressor(level=1) cctx.copy_stream(source, with_size) # Source content size is unknown, so no content size written. @@ -400,7 +505,7 @@ no_params = zstd.get_frame_parameters(no_size.getvalue()) with_params = zstd.get_frame_parameters(with_size.getvalue()) - self.assertEqual(no_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(with_params.content_size, 1536) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, 0) @@ -426,19 +531,18 @@ source.seek(0) dest = io.BytesIO() - cctx = zstd.ZstdCompressor(threads=2) + cctx = zstd.ZstdCompressor(threads=2, write_content_size=False) r, w = cctx.copy_stream(source, dest) self.assertEqual(r, 3145728) self.assertEqual(w, 295) params = zstd.get_frame_parameters(dest.getvalue()) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) # Writing content size and checksum works. 
- cctx = zstd.ZstdCompressor(threads=2, write_content_size=True, - write_checksum=True) + cctx = zstd.ZstdCompressor(threads=2, write_checksum=True) dest = io.BytesIO() source.seek(0) cctx.copy_stream(source, dest, size=len(source.getvalue())) @@ -448,31 +552,227 @@ self.assertEqual(params.dict_id, 0) self.assertTrue(params.has_checksum) + def test_bad_size(self): + source = io.BytesIO() + source.write(b'a' * 32768) + source.write(b'b' * 32768) + source.seek(0) -def compress(data, level): - buffer = io.BytesIO() - cctx = zstd.ZstdCompressor(level=level) - with cctx.write_to(buffer) as compressor: - compressor.write(data) - return buffer.getvalue() + dest = io.BytesIO() + + cctx = zstd.ZstdCompressor() + + with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'): + cctx.copy_stream(source, dest, size=42) + + # Try another operation on this compressor. + source.seek(0) + dest = io.BytesIO() + cctx.copy_stream(source, dest) @make_cffi -class TestCompressor_write_to(unittest.TestCase): +class TestCompressor_stream_reader(unittest.TestCase): + def test_context_manager(self): + cctx = zstd.ZstdCompressor() + + reader = cctx.stream_reader(b'foo' * 60) + with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): + reader.read(10) + + with cctx.stream_reader(b'foo') as reader: + with self.assertRaisesRegexp(ValueError, 'cannot __enter__ multiple times'): + with reader as reader2: + pass + + def test_not_implemented(self): + cctx = zstd.ZstdCompressor() + + with cctx.stream_reader(b'foo' * 60) as reader: + with self.assertRaises(io.UnsupportedOperation): + reader.readline() + + with self.assertRaises(io.UnsupportedOperation): + reader.readlines() + + # This could probably be implemented someday. 
+ with self.assertRaises(NotImplementedError): + reader.readall() + + with self.assertRaises(io.UnsupportedOperation): + iter(reader) + + with self.assertRaises(io.UnsupportedOperation): + next(reader) + + with self.assertRaises(OSError): + reader.writelines([]) + + with self.assertRaises(OSError): + reader.write(b'foo') + + def test_constant_methods(self): + cctx = zstd.ZstdCompressor() + + with cctx.stream_reader(b'boo') as reader: + self.assertTrue(reader.readable()) + self.assertFalse(reader.writable()) + self.assertFalse(reader.seekable()) + self.assertFalse(reader.isatty()) + self.assertIsNone(reader.flush()) + + def test_read_closed(self): + cctx = zstd.ZstdCompressor() + + with cctx.stream_reader(b'foo' * 60) as reader: + reader.close() + with self.assertRaisesRegexp(ValueError, 'stream is closed'): + reader.read(10) + + def test_read_bad_size(self): + cctx = zstd.ZstdCompressor() + + with cctx.stream_reader(b'foo') as reader: + with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'): + reader.read(-1) + + with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'): + reader.read(0) + + def test_read_buffer(self): + cctx = zstd.ZstdCompressor() + + source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60]) + frame = cctx.compress(source) + + with cctx.stream_reader(source) as reader: + self.assertEqual(reader.tell(), 0) + + # We should get entire frame in one read. 
+ result = reader.read(8192) + self.assertEqual(result, frame) + self.assertEqual(reader.tell(), len(result)) + self.assertEqual(reader.read(), b'') + self.assertEqual(reader.tell(), len(result)) + + def test_read_buffer_small_chunks(self): + cctx = zstd.ZstdCompressor() + + source = b'foo' * 60 + chunks = [] + + with cctx.stream_reader(source) as reader: + self.assertEqual(reader.tell(), 0) + + while True: + chunk = reader.read(1) + if not chunk: + break + + chunks.append(chunk) + self.assertEqual(reader.tell(), sum(map(len, chunks))) + + self.assertEqual(b''.join(chunks), cctx.compress(source)) + + def test_read_stream(self): + cctx = zstd.ZstdCompressor() + + source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60]) + frame = cctx.compress(source) + + with cctx.stream_reader(io.BytesIO(source), size=len(source)) as reader: + self.assertEqual(reader.tell(), 0) + + chunk = reader.read(8192) + self.assertEqual(chunk, frame) + self.assertEqual(reader.tell(), len(chunk)) + self.assertEqual(reader.read(), b'') + self.assertEqual(reader.tell(), len(chunk)) + + def test_read_stream_small_chunks(self): + cctx = zstd.ZstdCompressor() + + source = b'foo' * 60 + chunks = [] + + with cctx.stream_reader(io.BytesIO(source), size=len(source)) as reader: + self.assertEqual(reader.tell(), 0) + + while True: + chunk = reader.read(1) + if not chunk: + break + + chunks.append(chunk) + self.assertEqual(reader.tell(), sum(map(len, chunks))) + + self.assertEqual(b''.join(chunks), cctx.compress(source)) + + def test_read_after_exit(self): + cctx = zstd.ZstdCompressor() + + with cctx.stream_reader(b'foo' * 60) as reader: + while reader.read(8192): + pass + + with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): + reader.read(10) + + def test_bad_size(self): + cctx = zstd.ZstdCompressor() + + source = io.BytesIO(b'foobar') + + with cctx.stream_reader(source, size=2) as reader: + with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'): + 
reader.read(10) + + # Try another compression operation. + with cctx.stream_reader(source, size=42): + pass + + +@make_cffi +class TestCompressor_stream_writer(unittest.TestCase): def test_empty(self): - result = compress(b'', 1) + buffer = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) + with cctx.stream_writer(buffer) as compressor: + compressor.write(b'') + + result = buffer.getvalue() self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') params = zstd.get_frame_parameters(result) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 524288) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) + def test_input_types(self): + expected = b'\x28\xb5\x2f\xfd\x00\x48\x19\x00\x00\x66\x6f\x6f' + cctx = zstd.ZstdCompressor(level=1) + + mutable_array = bytearray(3) + mutable_array[:] = b'foo' + + sources = [ + memoryview(b'foo'), + bytearray(b'foo'), + mutable_array, + ] + + for source in sources: + buffer = io.BytesIO() + with cctx.stream_writer(buffer) as compressor: + compressor.write(source) + + self.assertEqual(buffer.getvalue(), expected) + def test_multiple_compress(self): buffer = io.BytesIO() cctx = zstd.ZstdCompressor(level=5) - with cctx.write_to(buffer) as compressor: + with cctx.stream_writer(buffer) as compressor: self.assertEqual(compressor.write(b'foo'), 0) self.assertEqual(compressor.write(b'bar'), 0) self.assertEqual(compressor.write(b'x' * 8192), 0) @@ -491,35 +791,40 @@ d = zstd.train_dictionary(8192, samples) + h = hashlib.sha1(d.as_bytes()).hexdigest() + self.assertEqual(h, '3040faa0ddc37d50e71a4dd28052cb8db5d9d027') + buffer = io.BytesIO() cctx = zstd.ZstdCompressor(level=9, dict_data=d) - with cctx.write_to(buffer) as compressor: + with cctx.stream_writer(buffer) as compressor: self.assertEqual(compressor.write(b'foo'), 0) self.assertEqual(compressor.write(b'bar'), 0) - 
self.assertEqual(compressor.write(b'foo' * 16384), 634) + self.assertEqual(compressor.write(b'foo' * 16384), 0) compressed = buffer.getvalue() params = zstd.get_frame_parameters(compressed) - self.assertEqual(params.content_size, 0) - self.assertEqual(params.window_size, 1024) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) + self.assertEqual(params.window_size, 2097152) self.assertEqual(params.dict_id, d.dict_id()) self.assertFalse(params.has_checksum) - - self.assertEqual(compressed[0:32], - b'\x28\xb5\x2f\xfd\x03\x00\x55\x7b\x6b\x5e\x54\x00' - b'\x00\x00\x02\xfc\xf4\xa5\xba\x23\x3f\x85\xb3\x54' - b'\x00\x00\x18\x6f\x6f\x66\x01\x00') - - h = hashlib.sha1(compressed).hexdigest() - self.assertEqual(h, '1c5bcd25181bcd8c1a73ea8773323e0056129f92') + self.assertEqual(compressed, + b'\x28\xb5\x2f\xfd\x03\x58\x06\x59\xb5\x52\x5d\x00' + b'\x00\x00\x02\xfc\x3d\x3f\xd9\xb0\x51\x03\x45\x89') def test_compression_params(self): - params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST) + params = zstd.ZstdCompressionParameters( + window_log=20, + chain_log=6, + hash_log=12, + min_match=5, + search_log=4, + target_length=10, + compression_strategy=zstd.STRATEGY_FAST) buffer = io.BytesIO() cctx = zstd.ZstdCompressor(compression_params=params) - with cctx.write_to(buffer) as compressor: + with cctx.stream_writer(buffer) as compressor: self.assertEqual(compressor.write(b'foo'), 0) self.assertEqual(compressor.write(b'bar'), 0) self.assertEqual(compressor.write(b'foobar' * 16384), 0) @@ -527,29 +832,29 @@ compressed = buffer.getvalue() params = zstd.get_frame_parameters(compressed) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 1048576) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) h = hashlib.sha1(compressed).hexdigest() - self.assertEqual(h, '1ae31f270ed7de14235221a604b31ecd517ebd99') + self.assertEqual(h, 
'2a8111d72eb5004cdcecbdac37da9f26720d30ef') def test_write_checksum(self): no_checksum = io.BytesIO() cctx = zstd.ZstdCompressor(level=1) - with cctx.write_to(no_checksum) as compressor: + with cctx.stream_writer(no_checksum) as compressor: self.assertEqual(compressor.write(b'foobar'), 0) with_checksum = io.BytesIO() cctx = zstd.ZstdCompressor(level=1, write_checksum=True) - with cctx.write_to(with_checksum) as compressor: + with cctx.stream_writer(with_checksum) as compressor: self.assertEqual(compressor.write(b'foobar'), 0) no_params = zstd.get_frame_parameters(no_checksum.getvalue()) with_params = zstd.get_frame_parameters(with_checksum.getvalue()) - self.assertEqual(no_params.content_size, 0) - self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) + self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, 0) self.assertFalse(no_params.has_checksum) @@ -560,13 +865,13 @@ def test_write_content_size(self): no_size = io.BytesIO() - cctx = zstd.ZstdCompressor(level=1) - with cctx.write_to(no_size) as compressor: + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) + with cctx.stream_writer(no_size) as compressor: self.assertEqual(compressor.write(b'foobar' * 256), 0) with_size = io.BytesIO() - cctx = zstd.ZstdCompressor(level=1, write_content_size=True) - with cctx.write_to(with_size) as compressor: + cctx = zstd.ZstdCompressor(level=1) + with cctx.stream_writer(with_size) as compressor: self.assertEqual(compressor.write(b'foobar' * 256), 0) # Source size is not known in streaming mode, so header not @@ -576,12 +881,12 @@ # Declaring size will write the header. 
with_size = io.BytesIO() - with cctx.write_to(with_size, size=len(b'foobar' * 256)) as compressor: + with cctx.stream_writer(with_size, size=len(b'foobar' * 256)) as compressor: self.assertEqual(compressor.write(b'foobar' * 256), 0) no_params = zstd.get_frame_parameters(no_size.getvalue()) with_params = zstd.get_frame_parameters(with_size.getvalue()) - self.assertEqual(no_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(with_params.content_size, 1536) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, 0) @@ -602,18 +907,22 @@ with_dict_id = io.BytesIO() cctx = zstd.ZstdCompressor(level=1, dict_data=d) - with cctx.write_to(with_dict_id) as compressor: + with cctx.stream_writer(with_dict_id) as compressor: self.assertEqual(compressor.write(b'foobarfoobar'), 0) + self.assertEqual(with_dict_id.getvalue()[4:5], b'\x03') + cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False) no_dict_id = io.BytesIO() - with cctx.write_to(no_dict_id) as compressor: + with cctx.stream_writer(no_dict_id) as compressor: self.assertEqual(compressor.write(b'foobarfoobar'), 0) + self.assertEqual(no_dict_id.getvalue()[4:5], b'\x00') + no_params = zstd.get_frame_parameters(no_dict_id.getvalue()) with_params = zstd.get_frame_parameters(with_dict_id.getvalue()) - self.assertEqual(no_params.content_size, 0) - self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN) + self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(no_params.dict_id, 0) self.assertEqual(with_params.dict_id, d.dict_id()) self.assertFalse(no_params.has_checksum) @@ -625,7 +934,8 @@ def test_memory_size(self): cctx = zstd.ZstdCompressor(level=3) buffer = io.BytesIO() - with cctx.write_to(buffer) as compressor: + with cctx.stream_writer(buffer) as compressor: + compressor.write(b'foo') size = compressor.memory_size() self.assertGreater(size, 
100000) @@ -633,7 +943,7 @@ def test_write_size(self): cctx = zstd.ZstdCompressor(level=3) dest = OpCountingBytesIO() - with cctx.write_to(dest, write_size=1) as compressor: + with cctx.stream_writer(dest, write_size=1) as compressor: self.assertEqual(compressor.write(b'foo'), 0) self.assertEqual(compressor.write(b'bar'), 0) self.assertEqual(compressor.write(b'foobar'), 0) @@ -643,7 +953,7 @@ def test_flush_repeated(self): cctx = zstd.ZstdCompressor(level=3) dest = OpCountingBytesIO() - with cctx.write_to(dest) as compressor: + with cctx.stream_writer(dest) as compressor: self.assertEqual(compressor.write(b'foo'), 0) self.assertEqual(dest._write_count, 0) self.assertEqual(compressor.flush(), 12) @@ -659,7 +969,7 @@ def test_flush_empty_block(self): cctx = zstd.ZstdCompressor(level=3, write_checksum=True) dest = OpCountingBytesIO() - with cctx.write_to(dest) as compressor: + with cctx.stream_writer(dest) as compressor: self.assertEqual(compressor.write(b'foobar' * 8192), 0) count = dest._write_count offset = dest.tell() @@ -680,50 +990,89 @@ def test_multithreaded(self): dest = io.BytesIO() cctx = zstd.ZstdCompressor(threads=2) - with cctx.write_to(dest) as compressor: + with cctx.stream_writer(dest) as compressor: compressor.write(b'a' * 1048576) compressor.write(b'b' * 1048576) compressor.write(b'c' * 1048576) self.assertEqual(len(dest.getvalue()), 295) + def test_tell(self): + dest = io.BytesIO() + cctx = zstd.ZstdCompressor() + with cctx.stream_writer(dest) as compressor: + self.assertEqual(compressor.tell(), 0) + + for i in range(256): + compressor.write(b'foo' * (i + 1)) + self.assertEqual(compressor.tell(), dest.tell()) + + def test_bad_size(self): + cctx = zstd.ZstdCompressor() + + dest = io.BytesIO() + + with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'): + with cctx.stream_writer(dest, size=2) as compressor: + compressor.write(b'foo') + + # Test another operation. 
+ with cctx.stream_writer(dest, size=42): + pass + + def test_tarfile_compat(self): + raise unittest.SkipTest('not yet fully working') + + dest = io.BytesIO() + cctx = zstd.ZstdCompressor() + with cctx.stream_writer(dest) as compressor: + with tarfile.open('tf', mode='w', fileobj=compressor) as tf: + tf.add(__file__, 'test_compressor.py') + + dest.seek(0) + + dctx = zstd.ZstdDecompressor() + with dctx.stream_reader(dest) as reader: + with tarfile.open(mode='r:', fileobj=reader) as tf: + for member in tf: + self.assertEqual(member.name, 'test_compressor.py') @make_cffi -class TestCompressor_read_from(unittest.TestCase): +class TestCompressor_read_to_iter(unittest.TestCase): def test_type_validation(self): cctx = zstd.ZstdCompressor() # Object with read() works. - for chunk in cctx.read_from(io.BytesIO()): + for chunk in cctx.read_to_iter(io.BytesIO()): pass # Buffer protocol works. - for chunk in cctx.read_from(b'foobar'): + for chunk in cctx.read_to_iter(b'foobar'): pass with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): - for chunk in cctx.read_from(True): + for chunk in cctx.read_to_iter(True): pass def test_read_empty(self): - cctx = zstd.ZstdCompressor(level=1) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) source = io.BytesIO() - it = cctx.read_from(source) + it = cctx.read_to_iter(source) chunks = list(it) self.assertEqual(len(chunks), 1) compressed = b''.join(chunks) self.assertEqual(compressed, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') # And again with the buffer protocol. 
- it = cctx.read_from(b'') + it = cctx.read_to_iter(b'') chunks = list(it) self.assertEqual(len(chunks), 1) compressed2 = b''.join(chunks) self.assertEqual(compressed2, compressed) def test_read_large(self): - cctx = zstd.ZstdCompressor(level=1) + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) source = io.BytesIO() source.write(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE) @@ -732,7 +1081,7 @@ # Creating an iterator should not perform any compression until # first read. - it = cctx.read_from(source, size=len(source.getvalue())) + it = cctx.read_to_iter(source, size=len(source.getvalue())) self.assertEqual(source.tell(), 0) # We should have exactly 2 output chunks. @@ -758,21 +1107,28 @@ self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue())) params = zstd.get_frame_parameters(b''.join(chunks)) - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 262144) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) # Now check the buffer protocol. 
- it = cctx.read_from(source.getvalue()) + it = cctx.read_to_iter(source.getvalue()) chunks = list(it) self.assertEqual(len(chunks), 2) + + params = zstd.get_frame_parameters(b''.join(chunks)) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) + #self.assertEqual(params.window_size, 262144) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue())) def test_read_write_size(self): source = OpCountingBytesIO(b'foobarfoobar') cctx = zstd.ZstdCompressor(level=3) - for chunk in cctx.read_from(source, read_size=1, write_size=1): + for chunk in cctx.read_to_iter(source, read_size=1, write_size=1): self.assertEqual(len(chunk), 1) self.assertEqual(source._read_count, len(source.getvalue()) + 1) @@ -786,17 +1142,22 @@ cctx = zstd.ZstdCompressor(threads=2) - compressed = b''.join(cctx.read_from(source)) + compressed = b''.join(cctx.read_to_iter(source)) self.assertEqual(len(compressed), 295) + def test_bad_size(self): + cctx = zstd.ZstdCompressor() + + source = io.BytesIO(b'a' * 42) + + with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'): + b''.join(cctx.read_to_iter(source, size=2)) + + # Test another operation on errored compressor. 
+ b''.join(cctx.read_to_iter(source)) + class TestCompressor_multi_compress_to_buffer(unittest.TestCase): - def test_multithreaded_unsupported(self): - cctx = zstd.ZstdCompressor(threads=2) - - with self.assertRaisesRegexp(zstd.ZstdError, 'function cannot be called on ZstdCompressor configured for multi-threaded compression'): - cctx.multi_compress_to_buffer([b'foo']) - def test_invalid_inputs(self): cctx = zstd.ZstdCompressor() @@ -819,7 +1180,7 @@ cctx.multi_compress_to_buffer([b'', b'', b'']) def test_list_input(self): - cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True) + cctx = zstd.ZstdCompressor(write_checksum=True) original = [b'foo' * 12, b'bar' * 6] frames = [cctx.compress(c) for c in original] @@ -834,7 +1195,7 @@ self.assertEqual(b[1].tobytes(), frames[1]) def test_buffer_with_segments_input(self): - cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True) + cctx = zstd.ZstdCompressor(write_checksum=True) original = [b'foo' * 4, b'bar' * 6] frames = [cctx.compress(c) for c in original] @@ -852,7 +1213,7 @@ self.assertEqual(result[1].tobytes(), frames[1]) def test_buffer_with_segments_collection_input(self): - cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True) + cctx = zstd.ZstdCompressor(write_checksum=True) original = [ b'foo1', @@ -886,10 +1247,10 @@ def test_multiple_threads(self): # threads argument will cause multi-threaded ZSTD APIs to be used, which will # make output different. 
- refcctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True) + refcctx = zstd.ZstdCompressor(write_checksum=True) reference = [refcctx.compress(b'x' * 64), refcctx.compress(b'y' * 64)] - cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True) + cctx = zstd.ZstdCompressor(write_checksum=True) frames = [] frames.extend(b'x' * 64 for i in range(256)) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_compressor_fuzzing.py --- a/contrib/python-zstandard/tests/test_compressor_fuzzing.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_compressor_fuzzing.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,6 @@ import io import os - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest try: import hypothesis @@ -12,7 +8,7 @@ except ImportError: raise unittest.SkipTest('hypothesis not available') -import zstd +import zstandard as zstd from . common import ( make_cffi, @@ -22,7 +18,57 @@ @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') @make_cffi -class TestCompressor_write_to_fuzzing(unittest.TestCase): +class TestCompressor_stream_reader_fuzzing(unittest.TestCase): + @hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + source_read_size=strategies.integers(1, 16384), + read_sizes=strategies.data()) + def test_stream_source_read_variance(self, original, level, source_read_size, + read_sizes): + refctx = zstd.ZstdCompressor(level=level) + ref_frame = refctx.compress(original) + + cctx = zstd.ZstdCompressor(level=level) + with cctx.stream_reader(io.BytesIO(original), size=len(original), + read_size=source_read_size) as reader: + chunks = [] + while True: + read_size = read_sizes.draw(strategies.integers(1, 16384)) + chunk = reader.read(read_size) + + if not chunk: + break + chunks.append(chunk) + + self.assertEqual(b''.join(chunks), ref_frame) + + 
@hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + source_read_size=strategies.integers(1, 16384), + read_sizes=strategies.data()) + def test_buffer_source_read_variance(self, original, level, source_read_size, + read_sizes): + + refctx = zstd.ZstdCompressor(level=level) + ref_frame = refctx.compress(original) + + cctx = zstd.ZstdCompressor(level=level) + with cctx.stream_reader(original, size=len(original), + read_size=source_read_size) as reader: + chunks = [] + while True: + read_size = read_sizes.draw(strategies.integers(1, 16384)) + chunk = reader.read(read_size) + if not chunk: + break + chunks.append(chunk) + + self.assertEqual(b''.join(chunks), ref_frame) + + +@unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') +@make_cffi +class TestCompressor_stream_writer_fuzzing(unittest.TestCase): @hypothesis.given(original=strategies.sampled_from(random_input_data()), level=strategies.integers(min_value=1, max_value=5), write_size=strategies.integers(min_value=1, max_value=1048576)) @@ -32,7 +78,7 @@ cctx = zstd.ZstdCompressor(level=level) b = io.BytesIO() - with cctx.write_to(b, size=len(original), write_size=write_size) as compressor: + with cctx.stream_writer(b, size=len(original), write_size=write_size) as compressor: compressor.write(original) self.assertEqual(b.getvalue(), ref_frame) @@ -62,13 +108,12 @@ @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') @make_cffi class TestCompressor_compressobj_fuzzing(unittest.TestCase): + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) @hypothesis.given(original=strategies.sampled_from(random_input_data()), level=strategies.integers(min_value=1, max_value=5), - chunk_sizes=strategies.streaming( - strategies.integers(min_value=1, max_value=4096))) + chunk_sizes=strategies.data()) def test_random_input_sizes(self, original, level, chunk_sizes): - 
chunk_sizes = iter(chunk_sizes) - refctx = zstd.ZstdCompressor(level=level) ref_frame = refctx.compress(original) @@ -78,7 +123,7 @@ chunks = [] i = 0 while True: - chunk_size = next(chunk_sizes) + chunk_size = chunk_sizes.draw(strategies.integers(1, 4096)) source = original[i:i + chunk_size] if not source: break @@ -93,7 +138,7 @@ @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') @make_cffi -class TestCompressor_read_from_fuzzing(unittest.TestCase): +class TestCompressor_read_to_iter_fuzzing(unittest.TestCase): @hypothesis.given(original=strategies.sampled_from(random_input_data()), level=strategies.integers(min_value=1, max_value=5), read_size=strategies.integers(min_value=1, max_value=4096), @@ -105,8 +150,9 @@ source = io.BytesIO(original) cctx = zstd.ZstdCompressor(level=level) - chunks = list(cctx.read_from(source, size=len(original), read_size=read_size, - write_size=write_size)) + chunks = list(cctx.read_to_iter(source, size=len(original), + read_size=read_size, + write_size=write_size)) self.assertEqual(b''.join(chunks), ref_frame) @@ -125,7 +171,6 @@ kwargs['dict_data'] = zstd.ZstdCompressionDict(original[0]) cctx = zstd.ZstdCompressor(level=1, - write_content_size=True, write_checksum=True, **kwargs) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_data_structures.py --- a/contrib/python-zstandard/tests/test_data_structures.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_data_structures.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,9 +1,7 @@ -try: - import unittest2 as unittest -except ImportError: - import unittest +import sys +import unittest -import zstd +import zstandard as zstd from . 
common import ( make_cffi, @@ -12,52 +10,104 @@ @make_cffi class TestCompressionParameters(unittest.TestCase): - def test_init_bad_arg_type(self): - with self.assertRaises(TypeError): - zstd.CompressionParameters() - - with self.assertRaises(TypeError): - zstd.CompressionParameters(0, 1) + def test_bounds(self): + zstd.ZstdCompressionParameters(window_log=zstd.WINDOWLOG_MIN, + chain_log=zstd.CHAINLOG_MIN, + hash_log=zstd.HASHLOG_MIN, + search_log=zstd.SEARCHLOG_MIN, + min_match=zstd.SEARCHLENGTH_MIN + 1, + target_length=zstd.TARGETLENGTH_MIN, + compression_strategy=zstd.STRATEGY_FAST) - def test_bounds(self): - zstd.CompressionParameters(zstd.WINDOWLOG_MIN, - zstd.CHAINLOG_MIN, - zstd.HASHLOG_MIN, - zstd.SEARCHLOG_MIN, - zstd.SEARCHLENGTH_MIN + 1, - zstd.TARGETLENGTH_MIN, - zstd.STRATEGY_FAST) + zstd.ZstdCompressionParameters(window_log=zstd.WINDOWLOG_MAX, + chain_log=zstd.CHAINLOG_MAX, + hash_log=zstd.HASHLOG_MAX, + search_log=zstd.SEARCHLOG_MAX, + min_match=zstd.SEARCHLENGTH_MAX - 1, + compression_strategy=zstd.STRATEGY_BTULTRA) - zstd.CompressionParameters(zstd.WINDOWLOG_MAX, - zstd.CHAINLOG_MAX, - zstd.HASHLOG_MAX, - zstd.SEARCHLOG_MAX, - zstd.SEARCHLENGTH_MAX - 1, - zstd.TARGETLENGTH_MAX, - zstd.STRATEGY_BTOPT) - - def test_get_compression_parameters(self): - p = zstd.get_compression_parameters(1) + def test_from_level(self): + p = zstd.ZstdCompressionParameters.from_level(1) self.assertIsInstance(p, zstd.CompressionParameters) self.assertEqual(p.window_log, 19) + p = zstd.ZstdCompressionParameters.from_level(-4) + self.assertEqual(p.window_log, 19) + self.assertEqual(p.compress_literals, 0) + def test_members(self): - p = zstd.CompressionParameters(10, 6, 7, 4, 5, 8, 1) + p = zstd.ZstdCompressionParameters(window_log=10, + chain_log=6, + hash_log=7, + search_log=4, + min_match=5, + target_length=8, + compression_strategy=1) self.assertEqual(p.window_log, 10) self.assertEqual(p.chain_log, 6) self.assertEqual(p.hash_log, 7) self.assertEqual(p.search_log, 4) - 
self.assertEqual(p.search_length, 5) + self.assertEqual(p.min_match, 5) self.assertEqual(p.target_length, 8) - self.assertEqual(p.strategy, 1) + self.assertEqual(p.compression_strategy, 1) + + p = zstd.ZstdCompressionParameters(compression_level=2) + self.assertEqual(p.compression_level, 2) + + p = zstd.ZstdCompressionParameters(threads=4) + self.assertEqual(p.threads, 4) + + p = zstd.ZstdCompressionParameters(threads=2, job_size=1048576, + overlap_size_log=6) + self.assertEqual(p.threads, 2) + self.assertEqual(p.job_size, 1048576) + self.assertEqual(p.overlap_size_log, 6) + + p = zstd.ZstdCompressionParameters(compression_level=2) + self.assertEqual(p.compress_literals, 1) + + p = zstd.ZstdCompressionParameters(compress_literals=False) + self.assertEqual(p.compress_literals, 0) + + p = zstd.ZstdCompressionParameters(compression_level=-1) + self.assertEqual(p.compression_level, -1) + self.assertEqual(p.compress_literals, 0) + + p = zstd.ZstdCompressionParameters(compression_level=-2, compress_literals=True) + self.assertEqual(p.compression_level, -2) + self.assertEqual(p.compress_literals, 1) + + p = zstd.ZstdCompressionParameters(force_max_window=True) + self.assertEqual(p.force_max_window, 1) + + p = zstd.ZstdCompressionParameters(enable_ldm=True) + self.assertEqual(p.enable_ldm, 1) + + p = zstd.ZstdCompressionParameters(ldm_hash_log=7) + self.assertEqual(p.ldm_hash_log, 7) + + p = zstd.ZstdCompressionParameters(ldm_min_match=6) + self.assertEqual(p.ldm_min_match, 6) + + p = zstd.ZstdCompressionParameters(ldm_bucket_size_log=7) + self.assertEqual(p.ldm_bucket_size_log, 7) + + p = zstd.ZstdCompressionParameters(ldm_hash_every_log=8) + self.assertEqual(p.ldm_hash_every_log, 8) def test_estimated_compression_context_size(self): - p = zstd.CompressionParameters(20, 16, 17, 1, 5, 16, zstd.STRATEGY_DFAST) + p = zstd.ZstdCompressionParameters(window_log=20, + chain_log=16, + hash_log=17, + search_log=1, + min_match=5, + target_length=16, + 
compression_strategy=zstd.STRATEGY_DFAST) # 32-bit has slightly different values from 64-bit. - self.assertAlmostEqual(p.estimated_compression_context_size(), 1287076, - delta=110) + self.assertAlmostEqual(p.estimated_compression_context_size(), 1294072, + delta=250) @make_cffi @@ -66,8 +116,18 @@ with self.assertRaises(TypeError): zstd.get_frame_parameters(None) - with self.assertRaises(TypeError): - zstd.get_frame_parameters(u'foobarbaz') + # Python 3 doesn't appear to convert unicode to Py_buffer. + if sys.version_info[0] >= 3: + with self.assertRaises(TypeError): + zstd.get_frame_parameters(u'foobarbaz') + else: + # CPython will convert unicode to Py_buffer. But CFFI won't. + if zstd.backend == 'cffi': + with self.assertRaises(TypeError): + zstd.get_frame_parameters(u'foobarbaz') + else: + with self.assertRaises(zstd.ZstdError): + zstd.get_frame_parameters(u'foobarbaz') def test_invalid_input_sizes(self): with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'): @@ -82,21 +142,21 @@ def test_attributes(self): params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x00') - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 1024) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) # Lowest 2 bits indicate a dictionary and length. Here, the dict id is 1 byte. params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x01\x00\xff') - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 1024) self.assertEqual(params.dict_id, 255) self.assertFalse(params.has_checksum) # Lowest 3rd bit indicates if checksum is present. 
params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x04\x00') - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 1024) self.assertEqual(params.dict_id, 0) self.assertTrue(params.has_checksum) @@ -110,7 +170,7 @@ # Window descriptor is 2nd byte after frame header. params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x40') - self.assertEqual(params.content_size, 0) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) self.assertEqual(params.window_size, 262144) self.assertEqual(params.dict_id, 0) self.assertFalse(params.has_checksum) @@ -121,3 +181,22 @@ self.assertEqual(params.window_size, 262144) self.assertEqual(params.dict_id, 15) self.assertTrue(params.has_checksum) + + def test_input_types(self): + v = zstd.FRAME_HEADER + b'\x00\x00' + + mutable_array = bytearray(len(v)) + mutable_array[:] = v + + sources = [ + memoryview(v), + bytearray(v), + mutable_array, + ] + + for source in sources: + params = zstd.get_frame_parameters(source) + self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN) + self.assertEqual(params.window_size, 1024) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_data_structures_fuzzing.py --- a/contrib/python-zstandard/tests/test_data_structures_fuzzing.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_data_structures_fuzzing.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,7 @@ import io import os - -try: - import unittest2 as unittest -except ImportError: - import unittest +import sys +import unittest try: import hypothesis @@ -12,7 +9,7 @@ except ImportError: raise unittest.SkipTest('hypothesis not available') -import zstd +import zstandard as zstd from .common import ( make_cffi, @@ -28,16 +25,17 @@ s_searchlog = strategies.integers(min_value=zstd.SEARCHLOG_MIN, 
max_value=zstd.SEARCHLOG_MAX) s_searchlength = strategies.integers(min_value=zstd.SEARCHLENGTH_MIN, - max_value=zstd.SEARCHLENGTH_MAX) + max_value=zstd.SEARCHLENGTH_MAX) s_targetlength = strategies.integers(min_value=zstd.TARGETLENGTH_MIN, - max_value=zstd.TARGETLENGTH_MAX) + max_value=2**32) s_strategy = strategies.sampled_from((zstd.STRATEGY_FAST, zstd.STRATEGY_DFAST, zstd.STRATEGY_GREEDY, zstd.STRATEGY_LAZY, zstd.STRATEGY_LAZY2, zstd.STRATEGY_BTLAZY2, - zstd.STRATEGY_BTOPT)) + zstd.STRATEGY_BTOPT, + zstd.STRATEGY_BTULTRA)) @make_cffi @@ -47,24 +45,17 @@ s_searchlength, s_targetlength, s_strategy) def test_valid_init(self, windowlog, chainlog, hashlog, searchlog, searchlength, targetlength, strategy): - # ZSTD_checkCParams moves the goal posts on us from what's advertised - # in the constants. So move along with them. - if searchlength == zstd.SEARCHLENGTH_MIN and strategy in (zstd.STRATEGY_FAST, zstd.STRATEGY_GREEDY): - searchlength += 1 - elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST: - searchlength -= 1 - - p = zstd.CompressionParameters(windowlog, chainlog, hashlog, - searchlog, searchlength, - targetlength, strategy) - - cctx = zstd.ZstdCompressor(compression_params=p) - with cctx.write_to(io.BytesIO()): - pass + zstd.ZstdCompressionParameters(window_log=windowlog, + chain_log=chainlog, + hash_log=hashlog, + search_log=searchlog, + min_match=searchlength, + target_length=targetlength, + compression_strategy=strategy) @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog, s_searchlength, s_targetlength, s_strategy) - def test_estimate_compression_context_size(self, windowlog, chainlog, + def test_estimated_compression_context_size(self, windowlog, chainlog, hashlog, searchlog, searchlength, targetlength, strategy): @@ -73,7 +64,12 @@ elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST: searchlength -= 1 - p = zstd.CompressionParameters(windowlog, chainlog, hashlog, - searchlog, 
searchlength, - targetlength, strategy) - size = zstd.estimate_compression_context_size(p) + p = zstd.ZstdCompressionParameters(window_log=windowlog, + chain_log=chainlog, + hash_log=hashlog, + search_log=searchlog, + min_match=searchlength, + target_length=targetlength, + compression_strategy=strategy) + size = p.estimated_compression_context_size() + diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_decompressor.py --- a/contrib/python-zstandard/tests/test_decompressor.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_decompressor.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,16 +1,14 @@ import io +import os import random import struct import sys +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -import zstd +import zstandard as zstd from .common import ( + generate_samples, make_cffi, OpCountingBytesIO, ) @@ -23,35 +21,124 @@ @make_cffi +class TestFrameHeaderSize(unittest.TestCase): + def test_empty(self): + with self.assertRaisesRegexp( + zstd.ZstdError, 'could not determine frame header size: Src size ' + 'is incorrect'): + zstd.frame_header_size(b'') + + def test_too_small(self): + with self.assertRaisesRegexp( + zstd.ZstdError, 'could not determine frame header size: Src size ' + 'is incorrect'): + zstd.frame_header_size(b'foob') + + def test_basic(self): + # It doesn't matter that it isn't a valid frame. 
+ self.assertEqual(zstd.frame_header_size(b'long enough but no magic'), 6) + + +@make_cffi +class TestFrameContentSize(unittest.TestCase): + def test_empty(self): + with self.assertRaisesRegexp(zstd.ZstdError, + 'error when determining content size'): + zstd.frame_content_size(b'') + + def test_too_small(self): + with self.assertRaisesRegexp(zstd.ZstdError, + 'error when determining content size'): + zstd.frame_content_size(b'foob') + + def test_bad_frame(self): + with self.assertRaisesRegexp(zstd.ZstdError, + 'error when determining content size'): + zstd.frame_content_size(b'invalid frame header') + + def test_unknown(self): + cctx = zstd.ZstdCompressor(write_content_size=False) + frame = cctx.compress(b'foobar') + + self.assertEqual(zstd.frame_content_size(frame), -1) + + def test_empty(self): + cctx = zstd.ZstdCompressor() + frame = cctx.compress(b'') + + self.assertEqual(zstd.frame_content_size(frame), 0) + + def test_basic(self): + cctx = zstd.ZstdCompressor() + frame = cctx.compress(b'foobar') + + self.assertEqual(zstd.frame_content_size(frame), 6) + + +@make_cffi +class TestDecompressor(unittest.TestCase): + def test_memory_size(self): + dctx = zstd.ZstdDecompressor() + + self.assertGreater(dctx.memory_size(), 100) + + +@make_cffi class TestDecompressor_decompress(unittest.TestCase): def test_empty_input(self): dctx = zstd.ZstdDecompressor() - with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): + with self.assertRaisesRegexp(zstd.ZstdError, 'error determining content size from frame header'): dctx.decompress(b'') def test_invalid_input(self): dctx = zstd.ZstdDecompressor() - with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): + with self.assertRaisesRegexp(zstd.ZstdError, 'error determining content size from frame header'): dctx.decompress(b'foobar') + def test_input_types(self): + cctx = zstd.ZstdCompressor(level=1) + compressed = cctx.compress(b'foo') + + mutable_array = bytearray(len(compressed)) + mutable_array[:] = 
compressed + + sources = [ + memoryview(compressed), + bytearray(compressed), + mutable_array, + ] + + dctx = zstd.ZstdDecompressor() + for source in sources: + self.assertEqual(dctx.decompress(source), b'foo') + def test_no_content_size_in_frame(self): cctx = zstd.ZstdCompressor(write_content_size=False) compressed = cctx.compress(b'foobar') dctx = zstd.ZstdDecompressor() - with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): + with self.assertRaisesRegexp(zstd.ZstdError, 'could not determine content size in frame header'): dctx.decompress(compressed) def test_content_size_present(self): - cctx = zstd.ZstdCompressor(write_content_size=True) + cctx = zstd.ZstdCompressor() compressed = cctx.compress(b'foobar') dctx = zstd.ZstdDecompressor() decompressed = dctx.decompress(compressed) self.assertEqual(decompressed, b'foobar') + def test_empty_roundtrip(self): + cctx = zstd.ZstdCompressor() + compressed = cctx.compress(b'') + + dctx = zstd.ZstdDecompressor() + decompressed = dctx.decompress(compressed) + + self.assertEqual(decompressed, b'') + def test_max_output_size(self): cctx = zstd.ZstdCompressor(write_content_size=False) source = b'foobar' * 256 @@ -63,7 +150,8 @@ self.assertEqual(decompressed, source) # Input size - 1 fails - with self.assertRaisesRegexp(zstd.ZstdError, 'Destination buffer is too small'): + with self.assertRaisesRegexp(zstd.ZstdError, + 'decompression error: did not decompress full frame'): dctx.decompress(compressed, max_output_size=len(source) - 1) # Input size + 1 works @@ -94,7 +182,7 @@ d = zstd.train_dictionary(8192, samples) orig = b'foobar' * 16384 - cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_content_size=True) + cctx = zstd.ZstdCompressor(level=1, dict_data=d) compressed = cctx.compress(orig) dctx = zstd.ZstdDecompressor(dict_data=d) @@ -113,7 +201,7 @@ sources = (b'foobar' * 8192, b'foo' * 8192, b'bar' * 8192) compressed = [] - cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_content_size=True) + cctx = 
zstd.ZstdCompressor(level=1, dict_data=d) for source in sources: compressed.append(cctx.compress(source)) @@ -122,6 +210,21 @@ decompressed = dctx.decompress(compressed[i]) self.assertEqual(decompressed, sources[i]) + def test_max_window_size(self): + with open(__file__, 'rb') as fh: + source = fh.read() + + # If we write a content size, the decompressor engages single pass + # mode and the window size doesn't come into play. + cctx = zstd.ZstdCompressor(write_content_size=False) + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor(max_window_size=1) + + with self.assertRaisesRegexp( + zstd.ZstdError, 'decompression error: Frame requires too much memory'): + dctx.decompress(frame, max_output_size=len(source)) + @make_cffi class TestDecompressor_copy_stream(unittest.TestCase): @@ -186,6 +289,211 @@ @make_cffi +class TestDecompressor_stream_reader(unittest.TestCase): + def test_context_manager(self): + dctx = zstd.ZstdDecompressor() + + reader = dctx.stream_reader(b'foo') + with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): + reader.read(1) + + with dctx.stream_reader(b'foo') as reader: + with self.assertRaisesRegexp(ValueError, 'cannot __enter__ multiple times'): + with reader as reader2: + pass + + def test_not_implemented(self): + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(b'foo') as reader: + with self.assertRaises(NotImplementedError): + reader.readline() + + with self.assertRaises(NotImplementedError): + reader.readlines() + + with self.assertRaises(NotImplementedError): + reader.readall() + + with self.assertRaises(NotImplementedError): + iter(reader) + + with self.assertRaises(NotImplementedError): + next(reader) + + with self.assertRaises(io.UnsupportedOperation): + reader.write(b'foo') + + with self.assertRaises(io.UnsupportedOperation): + reader.writelines([]) + + def test_constant_methods(self): + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(b'foo') as reader: + 
self.assertTrue(reader.readable()) + self.assertFalse(reader.writable()) + self.assertTrue(reader.seekable()) + self.assertFalse(reader.isatty()) + self.assertIsNone(reader.flush()) + + def test_read_closed(self): + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(b'foo') as reader: + reader.close() + with self.assertRaisesRegexp(ValueError, 'stream is closed'): + reader.read(1) + + def test_bad_read_size(self): + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(b'foo') as reader: + with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'): + reader.read(-1) + + with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'): + reader.read(0) + + def test_read_buffer(self): + cctx = zstd.ZstdCompressor() + + source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60]) + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(frame) as reader: + self.assertEqual(reader.tell(), 0) + + # We should get entire frame in one read. + result = reader.read(8192) + self.assertEqual(result, source) + self.assertEqual(reader.tell(), len(source)) + + # Read after EOF should return empty bytes. 
+ self.assertEqual(reader.read(), b'') + self.assertEqual(reader.tell(), len(result)) + + self.assertTrue(reader.closed()) + + def test_read_buffer_small_chunks(self): + cctx = zstd.ZstdCompressor() + source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60]) + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + chunks = [] + + with dctx.stream_reader(frame, read_size=1) as reader: + while True: + chunk = reader.read(1) + if not chunk: + break + + chunks.append(chunk) + self.assertEqual(reader.tell(), sum(map(len, chunks))) + + self.assertEqual(b''.join(chunks), source) + + def test_read_stream(self): + cctx = zstd.ZstdCompressor() + source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60]) + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + with dctx.stream_reader(io.BytesIO(frame)) as reader: + self.assertEqual(reader.tell(), 0) + + chunk = reader.read(8192) + self.assertEqual(chunk, source) + self.assertEqual(reader.tell(), len(source)) + self.assertEqual(reader.read(), b'') + self.assertEqual(reader.tell(), len(source)) + + def test_read_stream_small_chunks(self): + cctx = zstd.ZstdCompressor() + source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60]) + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + chunks = [] + + with dctx.stream_reader(io.BytesIO(frame), read_size=1) as reader: + while True: + chunk = reader.read(1) + if not chunk: + break + + chunks.append(chunk) + self.assertEqual(reader.tell(), sum(map(len, chunks))) + + self.assertEqual(b''.join(chunks), source) + + def test_read_after_exit(self): + cctx = zstd.ZstdCompressor() + frame = cctx.compress(b'foo' * 60) + + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(frame) as reader: + while reader.read(16): + pass + + with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): + reader.read(10) + + def test_illegal_seeks(self): + cctx = zstd.ZstdCompressor() + frame = cctx.compress(b'foo' * 60) + + dctx = 
zstd.ZstdDecompressor() + + with dctx.stream_reader(frame) as reader: + with self.assertRaisesRegexp(ValueError, + 'cannot seek to negative position'): + reader.seek(-1, os.SEEK_SET) + + reader.read(1) + + with self.assertRaisesRegexp( + ValueError, 'cannot seek zstd decompression stream backwards'): + reader.seek(0, os.SEEK_SET) + + with self.assertRaisesRegexp( + ValueError, 'cannot seek zstd decompression stream backwards'): + reader.seek(-1, os.SEEK_CUR) + + with self.assertRaisesRegexp( + ValueError, + 'zstd decompression streams cannot be seeked with SEEK_END'): + reader.seek(0, os.SEEK_END) + + reader.close() + + with self.assertRaisesRegexp(ValueError, 'stream is closed'): + reader.seek(4, os.SEEK_SET) + + with self.assertRaisesRegexp( + zstd.ZstdError, 'seek\(\) must be called from an active context'): + reader.seek(0) + + def test_seek(self): + source = b'foobar' * 60 + cctx = zstd.ZstdCompressor() + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(frame) as reader: + reader.seek(3) + self.assertEqual(reader.read(3), b'bar') + + reader.seek(4, os.SEEK_CUR) + self.assertEqual(reader.read(2), b'ar') + + +@make_cffi class TestDecompressor_decompressobj(unittest.TestCase): def test_simple(self): data = zstd.ZstdCompressor(level=1).compress(b'foobar') @@ -194,6 +502,24 @@ dobj = dctx.decompressobj() self.assertEqual(dobj.decompress(data), b'foobar') + def test_input_types(self): + compressed = zstd.ZstdCompressor(level=1).compress(b'foo') + + dctx = zstd.ZstdDecompressor() + + mutable_array = bytearray(len(compressed)) + mutable_array[:] = compressed + + sources = [ + memoryview(compressed), + bytearray(compressed), + mutable_array, + ] + + for source in sources: + dobj = dctx.decompressobj() + self.assertEqual(dobj.decompress(source), b'foo') + def test_reuse(self): data = zstd.ZstdCompressor(level=1).compress(b'foobar') @@ -204,22 +530,58 @@ with self.assertRaisesRegexp(zstd.ZstdError, 'cannot use a 
decompressobj'): dobj.decompress(data) + def test_bad_write_size(self): + dctx = zstd.ZstdDecompressor() + + with self.assertRaisesRegexp(ValueError, 'write_size must be positive'): + dctx.decompressobj(write_size=0) + + def test_write_size(self): + source = b'foo' * 64 + b'bar' * 128 + data = zstd.ZstdCompressor(level=1).compress(source) + + dctx = zstd.ZstdDecompressor() + + for i in range(128): + dobj = dctx.decompressobj(write_size=i + 1) + self.assertEqual(dobj.decompress(data), source) def decompress_via_writer(data): buffer = io.BytesIO() dctx = zstd.ZstdDecompressor() - with dctx.write_to(buffer) as decompressor: + with dctx.stream_writer(buffer) as decompressor: decompressor.write(data) return buffer.getvalue() @make_cffi -class TestDecompressor_write_to(unittest.TestCase): +class TestDecompressor_stream_writer(unittest.TestCase): def test_empty_roundtrip(self): cctx = zstd.ZstdCompressor() empty = cctx.compress(b'') self.assertEqual(decompress_via_writer(empty), b'') + def test_input_types(self): + cctx = zstd.ZstdCompressor(level=1) + compressed = cctx.compress(b'foo') + + mutable_array = bytearray(len(compressed)) + mutable_array[:] = compressed + + sources = [ + memoryview(compressed), + bytearray(compressed), + mutable_array, + ] + + dctx = zstd.ZstdDecompressor() + for source in sources: + buffer = io.BytesIO() + with dctx.stream_writer(buffer) as decompressor: + decompressor.write(source) + + self.assertEqual(buffer.getvalue(), b'foo') + def test_large_roundtrip(self): chunks = [] for i in range(255): @@ -242,7 +604,7 @@ buffer = io.BytesIO() dctx = zstd.ZstdDecompressor() - with dctx.write_to(buffer) as decompressor: + with dctx.stream_writer(buffer) as decompressor: pos = 0 while pos < len(compressed): pos2 = pos + 8192 @@ -262,14 +624,14 @@ orig = b'foobar' * 16384 buffer = io.BytesIO() cctx = zstd.ZstdCompressor(dict_data=d) - with cctx.write_to(buffer) as compressor: - self.assertEqual(compressor.write(orig), 1544) + with 
cctx.stream_writer(buffer) as compressor: + self.assertEqual(compressor.write(orig), 0) compressed = buffer.getvalue() buffer = io.BytesIO() dctx = zstd.ZstdDecompressor(dict_data=d) - with dctx.write_to(buffer) as decompressor: + with dctx.stream_writer(buffer) as decompressor: self.assertEqual(decompressor.write(compressed), len(orig)) self.assertEqual(buffer.getvalue(), orig) @@ -277,7 +639,7 @@ def test_memory_size(self): dctx = zstd.ZstdDecompressor() buffer = io.BytesIO() - with dctx.write_to(buffer) as decompressor: + with dctx.stream_writer(buffer) as decompressor: size = decompressor.memory_size() self.assertGreater(size, 100000) @@ -286,7 +648,7 @@ source = zstd.ZstdCompressor().compress(b'foobarfoobar') dest = OpCountingBytesIO() dctx = zstd.ZstdDecompressor() - with dctx.write_to(dest, write_size=1) as decompressor: + with dctx.stream_writer(dest, write_size=1) as decompressor: s = struct.Struct('>B') for c in source: if not isinstance(c, str): @@ -298,29 +660,29 @@ @make_cffi -class TestDecompressor_read_from(unittest.TestCase): +class TestDecompressor_read_to_iter(unittest.TestCase): def test_type_validation(self): dctx = zstd.ZstdDecompressor() # Object with read() works. - dctx.read_from(io.BytesIO()) + dctx.read_to_iter(io.BytesIO()) # Buffer protocol works. - dctx.read_from(b'foobar') + dctx.read_to_iter(b'foobar') with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): - b''.join(dctx.read_from(True)) + b''.join(dctx.read_to_iter(True)) def test_empty_input(self): dctx = zstd.ZstdDecompressor() source = io.BytesIO() - it = dctx.read_from(source) + it = dctx.read_to_iter(source) # TODO this is arguably wrong. Should get an error about missing frame foo. 
with self.assertRaises(StopIteration): next(it) - it = dctx.read_from(b'') + it = dctx.read_to_iter(b'') with self.assertRaises(StopIteration): next(it) @@ -328,11 +690,11 @@ dctx = zstd.ZstdDecompressor() source = io.BytesIO(b'foobar') - it = dctx.read_from(source) + it = dctx.read_to_iter(source) with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'): next(it) - it = dctx.read_from(b'foobar') + it = dctx.read_to_iter(b'foobar') with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'): next(it) @@ -344,7 +706,7 @@ source.seek(0) dctx = zstd.ZstdDecompressor() - it = dctx.read_from(source) + it = dctx.read_to_iter(source) # No chunks should be emitted since there is no data. with self.assertRaises(StopIteration): @@ -358,17 +720,17 @@ dctx = zstd.ZstdDecompressor() with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'): - b''.join(dctx.read_from(b'', skip_bytes=1, read_size=1)) + b''.join(dctx.read_to_iter(b'', skip_bytes=1, read_size=1)) with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'): - b''.join(dctx.read_from(b'foobar', skip_bytes=10)) + b''.join(dctx.read_to_iter(b'foobar', skip_bytes=10)) def test_skip_bytes(self): cctx = zstd.ZstdCompressor(write_content_size=False) compressed = cctx.compress(b'foobar') dctx = zstd.ZstdDecompressor() - output = b''.join(dctx.read_from(b'hdr' + compressed, skip_bytes=3)) + output = b''.join(dctx.read_to_iter(b'hdr' + compressed, skip_bytes=3)) self.assertEqual(output, b'foobar') def test_large_output(self): @@ -382,7 +744,7 @@ compressed.seek(0) dctx = zstd.ZstdDecompressor() - it = dctx.read_from(compressed) + it = dctx.read_to_iter(compressed) chunks = [] chunks.append(next(it)) @@ -395,7 +757,7 @@ self.assertEqual(decompressed, source.getvalue()) # And again with buffer protocol. 
- it = dctx.read_from(compressed.getvalue()) + it = dctx.read_to_iter(compressed.getvalue()) chunks = [] chunks.append(next(it)) chunks.append(next(it)) @@ -406,12 +768,13 @@ decompressed = b''.join(chunks) self.assertEqual(decompressed, source.getvalue()) + @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') def test_large_input(self): bytes = list(struct.Struct('>B').pack(i) for i in range(256)) compressed = io.BytesIO() input_size = 0 cctx = zstd.ZstdCompressor(level=1) - with cctx.write_to(compressed) as compressor: + with cctx.stream_writer(compressed) as compressor: while True: compressor.write(random.choice(bytes)) input_size += 1 @@ -426,7 +789,7 @@ zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE) dctx = zstd.ZstdDecompressor() - it = dctx.read_from(compressed) + it = dctx.read_to_iter(compressed) chunks = [] chunks.append(next(it)) @@ -440,7 +803,7 @@ self.assertEqual(len(decompressed), input_size) # And again with buffer protocol. - it = dctx.read_from(compressed.getvalue()) + it = dctx.read_to_iter(compressed.getvalue()) chunks = [] chunks.append(next(it)) @@ -460,7 +823,7 @@ source = io.BytesIO() compressed = io.BytesIO() - with cctx.write_to(compressed) as compressor: + with cctx.stream_writer(compressed) as compressor: for i in range(256): chunk = b'\0' * 1024 compressor.write(chunk) @@ -473,17 +836,34 @@ self.assertEqual(simple, source.getvalue()) compressed.seek(0) - streamed = b''.join(dctx.read_from(compressed)) + streamed = b''.join(dctx.read_to_iter(compressed)) self.assertEqual(streamed, source.getvalue()) def test_read_write_size(self): source = OpCountingBytesIO(zstd.ZstdCompressor().compress(b'foobarfoobar')) dctx = zstd.ZstdDecompressor() - for chunk in dctx.read_from(source, read_size=1, write_size=1): + for chunk in dctx.read_to_iter(source, read_size=1, write_size=1): self.assertEqual(len(chunk), 1) self.assertEqual(source._read_count, len(source.getvalue())) + def test_magic_less(self): + params = 
zstd.CompressionParameters.from_level( + 1, format=zstd.FORMAT_ZSTD1_MAGICLESS) + cctx = zstd.ZstdCompressor(compression_params=params) + frame = cctx.compress(b'foobar') + + self.assertNotEqual(frame[0:4], b'\x28\xb5\x2f\xfd') + + dctx = zstd.ZstdDecompressor() + with self.assertRaisesRegexp( + zstd.ZstdError, 'error determining content size from frame header'): + dctx.decompress(frame) + + dctx = zstd.ZstdDecompressor(format=zstd.FORMAT_ZSTD1_MAGICLESS) + res = b''.join(dctx.read_to_iter(frame)) + self.assertEqual(res, b'foobar') + @make_cffi class TestDecompressor_content_dict_chain(unittest.TestCase): @@ -511,19 +891,20 @@ with self.assertRaisesRegexp(ValueError, 'chunk 0 is not a valid zstd frame'): dctx.decompress_content_dict_chain([b'foo' * 8]) - no_size = zstd.ZstdCompressor().compress(b'foo' * 64) + no_size = zstd.ZstdCompressor(write_content_size=False).compress(b'foo' * 64) with self.assertRaisesRegexp(ValueError, 'chunk 0 missing content size in frame'): dctx.decompress_content_dict_chain([no_size]) # Corrupt first frame. 
- frame = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64) + frame = zstd.ZstdCompressor().compress(b'foo' * 64) frame = frame[0:12] + frame[15:] - with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 0'): + with self.assertRaisesRegexp(zstd.ZstdError, + 'chunk 0 did not decompress full frame'): dctx.decompress_content_dict_chain([frame]) def test_bad_subsequent_input(self): - initial = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64) + initial = zstd.ZstdCompressor().compress(b'foo' * 64) dctx = zstd.ZstdDecompressor() @@ -539,17 +920,17 @@ with self.assertRaisesRegexp(ValueError, 'chunk 1 is not a valid zstd frame'): dctx.decompress_content_dict_chain([initial, b'foo' * 8]) - no_size = zstd.ZstdCompressor().compress(b'foo' * 64) + no_size = zstd.ZstdCompressor(write_content_size=False).compress(b'foo' * 64) with self.assertRaisesRegexp(ValueError, 'chunk 1 missing content size in frame'): dctx.decompress_content_dict_chain([initial, no_size]) # Corrupt second frame. 
- cctx = zstd.ZstdCompressor(write_content_size=True, dict_data=zstd.ZstdCompressionDict(b'foo' * 64)) + cctx = zstd.ZstdCompressor(dict_data=zstd.ZstdCompressionDict(b'foo' * 64)) frame = cctx.compress(b'bar' * 64) frame = frame[0:12] + frame[15:] - with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 1'): + with self.assertRaisesRegexp(zstd.ZstdError, 'chunk 1 did not decompress full frame'): dctx.decompress_content_dict_chain([initial, frame]) def test_simple(self): @@ -562,10 +943,10 @@ ] chunks = [] - chunks.append(zstd.ZstdCompressor(write_content_size=True).compress(original[0])) + chunks.append(zstd.ZstdCompressor().compress(original[0])) for i, chunk in enumerate(original[1:]): d = zstd.ZstdCompressionDict(original[i]) - cctx = zstd.ZstdCompressor(dict_data=d, write_content_size=True) + cctx = zstd.ZstdCompressor(dict_data=d) chunks.append(cctx.compress(chunk)) for i in range(1, len(original)): @@ -594,7 +975,7 @@ dctx.multi_decompress_to_buffer([b'foobarbaz']) def test_list_input(self): - cctx = zstd.ZstdCompressor(write_content_size=True) + cctx = zstd.ZstdCompressor() original = [b'foo' * 4, b'bar' * 6] frames = [cctx.compress(d) for d in original] @@ -614,7 +995,7 @@ self.assertEqual(len(result[1]), 18) def test_list_input_frame_sizes(self): - cctx = zstd.ZstdCompressor(write_content_size=False) + cctx = zstd.ZstdCompressor() original = [b'foo' * 4, b'bar' * 6, b'baz' * 8] frames = [cctx.compress(d) for d in original] @@ -630,7 +1011,7 @@ self.assertEqual(result[i].tobytes(), data) def test_buffer_with_segments_input(self): - cctx = zstd.ZstdCompressor(write_content_size=True) + cctx = zstd.ZstdCompressor() original = [b'foo' * 4, b'bar' * 6] frames = [cctx.compress(d) for d in original] @@ -669,7 +1050,7 @@ self.assertEqual(result[i].tobytes(), data) def test_buffer_with_segments_collection_input(self): - cctx = zstd.ZstdCompressor(write_content_size=True) + cctx = zstd.ZstdCompressor() original = [ b'foo0' * 2, @@ -711,8 +1092,18 
@@ for i in range(5): self.assertEqual(decompressed[i].tobytes(), original[i]) + def test_dict(self): + d = zstd.train_dictionary(16384, generate_samples(), k=64, d=16) + + cctx = zstd.ZstdCompressor(dict_data=d, level=1) + frames = [cctx.compress(s) for s in generate_samples()] + + dctx = zstd.ZstdDecompressor(dict_data=d) + result = dctx.multi_decompress_to_buffer(frames) + self.assertEqual([o.tobytes() for o in result], generate_samples()) + def test_multiple_threads(self): - cctx = zstd.ZstdCompressor(write_content_size=True) + cctx = zstd.ZstdCompressor() frames = [] frames.extend(cctx.compress(b'x' * 64) for i in range(256)) @@ -727,15 +1118,22 @@ self.assertEqual(result[256].tobytes(), b'y' * 64) def test_item_failure(self): - cctx = zstd.ZstdCompressor(write_content_size=True) + cctx = zstd.ZstdCompressor() frames = [cctx.compress(b'x' * 128), cctx.compress(b'y' * 128)] - frames[1] = frames[1] + b'extra' + frames[1] = frames[1][0:15] + b'extra' + frames[1][15:] dctx = zstd.ZstdDecompressor() - with self.assertRaisesRegexp(zstd.ZstdError, 'error decompressing item 1: Src size incorrect'): + with self.assertRaisesRegexp(zstd.ZstdError, + 'error decompressing item 1: (' + 'Corrupted block|' + 'Destination buffer is too small)'): dctx.multi_decompress_to_buffer(frames) - with self.assertRaisesRegexp(zstd.ZstdError, 'error decompressing item 1: Src size incorrect'): + with self.assertRaisesRegexp(zstd.ZstdError, + 'error decompressing item 1: (' + 'Corrupted block|' + 'Destination buffer is too small)'): dctx.multi_decompress_to_buffer(frames, threads=2) + diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_decompressor_fuzzing.py --- a/contrib/python-zstandard/tests/test_decompressor_fuzzing.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_decompressor_fuzzing.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,6 @@ import io import os - -try: - import unittest2 as unittest -except ImportError: - import unittest +import 
unittest try: import hypothesis @@ -12,7 +8,7 @@ except ImportError: raise unittest.SkipTest('hypothesis not available') -import zstd +import zstandard as zstd from . common import ( make_cffi, @@ -22,15 +18,96 @@ @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') @make_cffi -class TestDecompressor_write_to_fuzzing(unittest.TestCase): +class TestDecompressor_stream_reader_fuzzing(unittest.TestCase): + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) + @hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + source_read_size=strategies.integers(1, 16384), + read_sizes=strategies.data()) + def test_stream_source_read_variance(self, original, level, source_read_size, + read_sizes): + cctx = zstd.ZstdCompressor(level=level) + frame = cctx.compress(original) + + dctx = zstd.ZstdDecompressor() + source = io.BytesIO(frame) + + chunks = [] + with dctx.stream_reader(source, read_size=source_read_size) as reader: + while True: + read_size = read_sizes.draw(strategies.integers(1, 16384)) + chunk = reader.read(read_size) + if not chunk: + break + + chunks.append(chunk) + + self.assertEqual(b''.join(chunks), original) + + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) + @hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + source_read_size=strategies.integers(1, 16384), + read_sizes=strategies.data()) + def test_buffer_source_read_variance(self, original, level, source_read_size, + read_sizes): + cctx = zstd.ZstdCompressor(level=level) + frame = cctx.compress(original) + + dctx = zstd.ZstdDecompressor() + chunks = [] + + with dctx.stream_reader(frame, read_size=source_read_size) as reader: + while True: + read_size = read_sizes.draw(strategies.integers(1, 16384)) + chunk = reader.read(read_size) + if not chunk: + 
break + + chunks.append(chunk) + + self.assertEqual(b''.join(chunks), original) + + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) + @hypothesis.given( + original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + source_read_size=strategies.integers(1, 16384), + seek_amounts=strategies.data(), + read_sizes=strategies.data()) + def test_relative_seeks(self, original, level, source_read_size, seek_amounts, + read_sizes): + cctx = zstd.ZstdCompressor(level=level) + frame = cctx.compress(original) + + dctx = zstd.ZstdDecompressor() + + with dctx.stream_reader(frame, read_size=source_read_size) as reader: + while True: + amount = seek_amounts.draw(strategies.integers(0, 16384)) + reader.seek(amount, os.SEEK_CUR) + + offset = reader.tell() + read_amount = read_sizes.draw(strategies.integers(1, 16384)) + chunk = reader.read(read_amount) + + if not chunk: + break + + self.assertEqual(original[offset:offset + len(chunk)], chunk) + + +@unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') +@make_cffi +class TestDecompressor_stream_writer_fuzzing(unittest.TestCase): @hypothesis.given(original=strategies.sampled_from(random_input_data()), level=strategies.integers(min_value=1, max_value=5), write_size=strategies.integers(min_value=1, max_value=8192), - input_sizes=strategies.streaming( - strategies.integers(min_value=1, max_value=4096))) + input_sizes=strategies.data()) def test_write_size_variance(self, original, level, write_size, input_sizes): - input_sizes = iter(input_sizes) - cctx = zstd.ZstdCompressor(level=level) frame = cctx.compress(original) @@ -38,9 +115,10 @@ source = io.BytesIO(frame) dest = io.BytesIO() - with dctx.write_to(dest, write_size=write_size) as decompressor: + with dctx.stream_writer(dest, write_size=write_size) as decompressor: while True: - chunk = source.read(next(input_sizes)) + input_size = 
input_sizes.draw(strategies.integers(1, 4096)) + chunk = source.read(input_size) if not chunk: break @@ -74,11 +152,8 @@ class TestDecompressor_decompressobj_fuzzing(unittest.TestCase): @hypothesis.given(original=strategies.sampled_from(random_input_data()), level=strategies.integers(min_value=1, max_value=5), - chunk_sizes=strategies.streaming( - strategies.integers(min_value=1, max_value=4096))) + chunk_sizes=strategies.data()) def test_random_input_sizes(self, original, level, chunk_sizes): - chunk_sizes = iter(chunk_sizes) - cctx = zstd.ZstdCompressor(level=level) frame = cctx.compress(original) @@ -89,7 +164,33 @@ chunks = [] while True: - chunk = source.read(next(chunk_sizes)) + chunk_size = chunk_sizes.draw(strategies.integers(1, 4096)) + chunk = source.read(chunk_size) + if not chunk: + break + + chunks.append(dobj.decompress(chunk)) + + self.assertEqual(b''.join(chunks), original) + + @hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + write_size=strategies.integers(min_value=1, + max_value=4 * zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE), + chunk_sizes=strategies.data()) + def test_random_output_sizes(self, original, level, write_size, chunk_sizes): + cctx = zstd.ZstdCompressor(level=level) + frame = cctx.compress(original) + + source = io.BytesIO(frame) + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj(write_size=write_size) + + chunks = [] + while True: + chunk_size = chunk_sizes.draw(strategies.integers(1, 4096)) + chunk = source.read(chunk_size) if not chunk: break @@ -100,7 +201,7 @@ @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') @make_cffi -class TestDecompressor_read_from_fuzzing(unittest.TestCase): +class TestDecompressor_read_to_iter_fuzzing(unittest.TestCase): @hypothesis.given(original=strategies.sampled_from(random_input_data()), level=strategies.integers(min_value=1, max_value=5), 
read_size=strategies.integers(min_value=1, max_value=4096), @@ -112,7 +213,7 @@ source = io.BytesIO(frame) dctx = zstd.ZstdDecompressor() - chunks = list(dctx.read_from(source, read_size=read_size, write_size=write_size)) + chunks = list(dctx.read_to_iter(source, read_size=read_size, write_size=write_size)) self.assertEqual(b''.join(chunks), original) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_estimate_sizes.py --- a/contrib/python-zstandard/tests/test_estimate_sizes.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_estimate_sizes.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,9 +1,6 @@ -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest -import zstd +import zstandard as zstd from . common import ( make_cffi, @@ -16,7 +13,3 @@ size = zstd.estimate_decompression_context_size() self.assertGreater(size, 100000) - def test_compression_size(self): - params = zstd.get_compression_parameters(3) - size = zstd.estimate_compression_context_size(params) - self.assertGreater(size, 100000) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_module_attributes.py --- a/contrib/python-zstandard/tests/test_module_attributes.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_module_attributes.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,11 +1,8 @@ from __future__ import unicode_literals -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest -import zstd +import zstandard as zstd from . 
common import ( make_cffi, @@ -15,7 +12,7 @@ @make_cffi class TestModuleAttributes(unittest.TestCase): def test_version(self): - self.assertEqual(zstd.ZSTD_VERSION, (1, 1, 3)) + self.assertEqual(zstd.ZSTD_VERSION, (1, 3, 4)) def test_constants(self): self.assertEqual(zstd.MAX_COMPRESSION_LEVEL, 22) @@ -23,6 +20,8 @@ def test_hasattr(self): attrs = ( + 'CONTENTSIZE_UNKNOWN', + 'CONTENTSIZE_ERROR', 'COMPRESSION_RECOMMENDED_INPUT_SIZE', 'COMPRESSION_RECOMMENDED_OUTPUT_SIZE', 'DECOMPRESSION_RECOMMENDED_INPUT_SIZE', @@ -40,7 +39,9 @@ 'SEARCHLENGTH_MIN', 'SEARCHLENGTH_MAX', 'TARGETLENGTH_MIN', - 'TARGETLENGTH_MAX', + 'LDM_MINMATCH_MIN', + 'LDM_MINMATCH_MAX', + 'LDM_BUCKETSIZELOG_MAX', 'STRATEGY_FAST', 'STRATEGY_DFAST', 'STRATEGY_GREEDY', @@ -48,6 +49,10 @@ 'STRATEGY_LAZY2', 'STRATEGY_BTLAZY2', 'STRATEGY_BTOPT', + 'STRATEGY_BTULTRA', + 'DICT_TYPE_AUTO', + 'DICT_TYPE_RAWCONTENT', + 'DICT_TYPE_FULLDICT', ) for a in attrs: diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/tests/test_train_dictionary.py --- a/contrib/python-zstandard/tests/test_train_dictionary.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/tests/test_train_dictionary.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,13 +1,11 @@ +import struct import sys +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -import zstd +import zstandard as zstd from . 
common import ( + generate_samples, make_cffi, ) @@ -30,55 +28,18 @@ with self.assertRaises(ValueError): zstd.train_dictionary(8192, [u'foo']) - def test_basic(self): - samples = [] - for i in range(128): - samples.append(b'foo' * 64) - samples.append(b'bar' * 64) - samples.append(b'foobar' * 64) - samples.append(b'baz' * 64) - samples.append(b'foobaz' * 64) - samples.append(b'bazfoo' * 64) + def test_no_params(self): + d = zstd.train_dictionary(8192, generate_samples()) + self.assertIsInstance(d.dict_id(), int_type) - d = zstd.train_dictionary(8192, samples) - self.assertLessEqual(len(d), 8192) - - dict_id = d.dict_id() - self.assertIsInstance(dict_id, int_type) + # The dictionary ID may be different across platforms. + expected = b'\x37\xa4\x30\xec' + struct.pack('= 3 + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:frame_content_size", +#else + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:frame_content_size", +#endif + kwlist, &source)) { + return NULL; + } + + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + + size = ZSTD_getFrameContentSize(source.buf, source.len); + + if (size == ZSTD_CONTENTSIZE_ERROR) { + PyErr_SetString(ZstdError, "error when determining content size"); + } + else if (size == ZSTD_CONTENTSIZE_UNKNOWN) { + result = PyLong_FromLong(-1); + } + else { + result = PyLong_FromUnsignedLongLong(size); + } + +finally: + PyBuffer_Release(&source); + + return result; +} + +PyDoc_STRVAR(frame_header_size__doc__, +"frame_header_size(data)\n" +"\n" +"Obtain the size of a frame header.\n" +); + +static PyObject* frame_header_size(PyObject* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "source", + NULL + }; + + Py_buffer source; + PyObject* result = NULL; + size_t zresult; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:frame_header_size", +#else + 
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:frame_header_size", +#endif + kwlist, &source)) { + return NULL; + } + + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + + zresult = ZSTD_frameHeaderSize(source.buf, source.len); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not determine frame header size: %s", + ZSTD_getErrorName(zresult)); + } + else { + result = PyLong_FromSize_t(zresult); + } + +finally: + + PyBuffer_Release(&source); + + return result; +} PyDoc_STRVAR(get_frame_parameters__doc__, "get_frame_parameters(data)\n" @@ -48,43 +132,48 @@ "Obtains a ``FrameParameters`` instance by parsing data.\n"); PyDoc_STRVAR(train_dictionary__doc__, -"train_dictionary(dict_size, samples)\n" -"\n" -"Train a dictionary from sample data.\n" -"\n" -"A compression dictionary of size ``dict_size`` will be created from the\n" -"iterable of samples provided by ``samples``.\n" -"\n" -"The raw dictionary content will be returned\n"); - -PyDoc_STRVAR(train_cover_dictionary__doc__, -"train_cover_dictionary(dict_size, samples, k=None, d=None, notifications=0, dict_id=0, level=0)\n" +"train_dictionary(dict_size, samples, k=None, d=None, steps=None,\n" +" threads=None,notifications=0, dict_id=0, level=0)\n" "\n" "Train a dictionary from sample data using the COVER algorithm.\n" "\n" -"This behaves like ``train_dictionary()`` except a different algorithm is\n" -"used to create the dictionary. The algorithm has 2 parameters: ``k`` and\n" -"``d``. These control the *segment size* and *dmer size*. A reasonable range\n" -"for ``k`` is ``[16, 2048+]``. A reasonable range for ``d`` is ``[6, 16]``.\n" +"A compression dictionary of size ``dict_size`` will be created from the\n" +"iterable of ``samples``. The raw dictionary bytes will be returned.\n" +"\n" +"The COVER algorithm has 2 parameters: ``k`` and ``d``. 
These control the\n" +"*segment size* and *dmer size*. A reasonable range for ``k`` is\n" +"``[16, 2048+]``. A reasonable range for ``d`` is ``[6, 16]``.\n" "``d`` must be less than or equal to ``k``.\n" +"\n" +"``steps`` can be specified to control the number of steps through potential\n" +"values of ``k`` and ``d`` to try. ``k`` and ``d`` will only be varied if\n" +"those arguments are not defined. i.e. if ``d`` is ``8``, then only ``k``\n" +"will be varied in this mode.\n" +"\n" +"``threads`` can specify how many threads to use to test various ``k`` and\n" +"``d`` values. ``-1`` will use as many threads as available CPUs. By default,\n" +"a single thread is used.\n" +"\n" +"When ``k`` and ``d`` are not defined, default values are used and the\n" +"algorithm will perform multiple iterations - or steps - to try to find\n" +"ideal parameters. If both ``k`` and ``d`` are specified, then those values\n" +"will be used. ``steps`` or ``threads`` triggers optimization mode to test\n" +"multiple ``k`` and ``d`` variations.\n" ); static char zstd_doc[] = "Interface to zstandard"; static PyMethodDef zstd_methods[] = { - /* TODO remove since it is a method on CompressionParameters. 
*/ - { "estimate_compression_context_size", (PyCFunction)estimate_compression_context_size, - METH_VARARGS, estimate_compression_context_size__doc__ }, { "estimate_decompression_context_size", (PyCFunction)estimate_decompression_context_size, METH_NOARGS, estimate_decompression_context_size__doc__ }, - { "get_compression_parameters", (PyCFunction)get_compression_parameters, - METH_VARARGS, get_compression_parameters__doc__ }, + { "frame_content_size", (PyCFunction)frame_content_size, + METH_VARARGS | METH_KEYWORDS, frame_content_size__doc__ }, + { "frame_header_size", (PyCFunction)frame_header_size, + METH_VARARGS | METH_KEYWORDS, frame_header_size__doc__ }, { "get_frame_parameters", (PyCFunction)get_frame_parameters, - METH_VARARGS, get_frame_parameters__doc__ }, + METH_VARARGS | METH_KEYWORDS, get_frame_parameters__doc__ }, { "train_dictionary", (PyCFunction)train_dictionary, METH_VARARGS | METH_KEYWORDS, train_dictionary__doc__ }, - { "train_cover_dictionary", (PyCFunction)train_cover_dictionary, - METH_VARARGS | METH_KEYWORDS, train_cover_dictionary__doc__ }, { NULL, NULL } }; @@ -94,10 +183,12 @@ void compressionparams_module_init(PyObject* mod); void constants_module_init(PyObject* mod); void compressiondict_module_init(PyObject* mod); +void compressionreader_module_init(PyObject* mod); void compressionwriter_module_init(PyObject* mod); void compressoriterator_module_init(PyObject* mod); void decompressor_module_init(PyObject* mod); void decompressobj_module_init(PyObject* mod); +void decompressionreader_module_init(PyObject *mod); void decompressionwriter_module_init(PyObject* mod); void decompressoriterator_module_init(PyObject* mod); void frameparams_module_init(PyObject* mod); @@ -118,7 +209,7 @@ We detect this mismatch here and refuse to load the module if this scenario is detected. 
*/ - if (ZSTD_VERSION_NUMBER != 10103 || ZSTD_versionNumber() != 10103) { + if (ZSTD_VERSION_NUMBER != 10304 || ZSTD_versionNumber() != 10304) { PyErr_SetString(PyExc_ImportError, "zstd C API mismatch; Python bindings not compiled against expected zstd version"); return; } @@ -128,16 +219,24 @@ compressiondict_module_init(m); compressobj_module_init(m); compressor_module_init(m); + compressionreader_module_init(m); compressionwriter_module_init(m); compressoriterator_module_init(m); constants_module_init(m); decompressor_module_init(m); decompressobj_module_init(m); + decompressionreader_module_init(m); decompressionwriter_module_init(m); decompressoriterator_module_init(m); frameparams_module_init(m); } +#if defined(__GNUC__) && (__GNUC__ >= 4) +# define PYTHON_ZSTD_VISIBILITY __attribute__ ((visibility ("default"))) +#else +# define PYTHON_ZSTD_VISIBILITY +#endif + #if PY_MAJOR_VERSION >= 3 static struct PyModuleDef zstd_module = { PyModuleDef_HEAD_INIT, @@ -147,7 +246,7 @@ zstd_methods }; -PyMODINIT_FUNC PyInit_zstd(void) { +PYTHON_ZSTD_VISIBILITY PyMODINIT_FUNC PyInit_zstd(void) { PyObject *m = PyModule_Create(&zstd_module); if (m) { zstd_module_init(m); @@ -159,7 +258,7 @@ return m; } #else -PyMODINIT_FUNC initzstd(void) { +PYTHON_ZSTD_VISIBILITY PyMODINIT_FUNC initzstd(void) { PyObject *m = Py_InitModule3("zstd", zstd_methods, zstd_doc); if (m) { zstd_module_init(m); @@ -211,3 +310,33 @@ return i; } + +/* Safer version of _PyBytes_Resize(). + * + * _PyBytes_Resize() only works if the refcount is 1. In some scenarios, + * we can get an object with a refcount > 1, even if it was just created + * with PyBytes_FromStringAndSize()! That's because (at least) CPython + * pre-allocates PyBytes instances of size 1 for every possible byte value. + * + * If non-0 is returned, obj may or may not be NULL. 
+ */ +int safe_pybytes_resize(PyObject** obj, Py_ssize_t size) { + PyObject* tmp; + + if ((*obj)->ob_refcnt == 1) { + return _PyBytes_Resize(obj, size); + } + + tmp = PyBytes_FromStringAndSize(NULL, size); + if (!tmp) { + return -1; + } + + memcpy(PyBytes_AS_STRING(tmp), PyBytes_AS_STRING(*obj), + PyBytes_GET_SIZE(*obj)); + + Py_DECREF(*obj); + *obj = tmp; + + return 0; +} \ No newline at end of file diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/COPYING Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. 
+ + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
\ No newline at end of file diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/PATENTS --- a/contrib/python-zstandard/zstd/PATENTS Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -Additional Grant of Patent Rights Version 2 - -"Software" means the Zstandard software distributed by Facebook, Inc. - -Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software -("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable -(subject to the termination provision below) license under any Necessary -Claims, to make, have made, use, sell, offer to sell, import, and otherwise -transfer the Software. For avoidance of doubt, no license is granted under -Facebook’s rights in any patent claims that are infringed by (i) modifications -to the Software made by you or any third party or (ii) the Software in -combination with any software or other technology. - -The license granted hereunder will terminate, automatically and without notice, -if you (or any of your subsidiaries, corporate affiliates or agents) initiate -directly or indirectly, or take a direct financial interest in, any Patent -Assertion: (i) against Facebook or any of its subsidiaries or corporate -affiliates, (ii) against any party if such Patent Assertion arises in whole or -in part from any software, technology, product or service of Facebook or any of -its subsidiaries or corporate affiliates, or (iii) against any party relating -to the Software. Notwithstanding the foregoing, if Facebook or any of its -subsidiaries or corporate affiliates files a lawsuit alleging patent -infringement against you in the first instance, and you respond by filing a -patent infringement counterclaim in that lawsuit against that party that is -unrelated to the Software, the license granted hereunder will not terminate -under section (i) of this paragraph due to such counterclaim. 
- -A "Necessary Claim" is a claim of a patent owned by Facebook that is -necessarily infringed by the Software standing alone. - -A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, -or contributory infringement or inducement to infringe any patent, including a -cross-claim or counterclaim. diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/bitstream.h --- a/contrib/python-zstandard/zstd/common/bitstream.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/bitstream.h Wed Apr 18 15:32:08 2018 -0400 @@ -2,7 +2,7 @@ bitstream Part of FSE library header file (to include) - Copyright (C) 2013-2016, Yann Collet. + Copyright (C) 2013-2017, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -39,7 +39,6 @@ extern "C" { #endif - /* * This API consists of small unitary functions, which must be inlined for best performance. * Since link-time-optimization is not available for all compilers, @@ -53,6 +52,18 @@ #include "error_private.h" /* error codes and messages */ +/*-************************************* +* Debug +***************************************/ +#if defined(BIT_DEBUG) && (BIT_DEBUG>=1) +# include +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + + /*========================================= * Target specific =========================================*/ @@ -60,18 +71,22 @@ # include /* support for bextr (experimental) */ #endif +#define STREAM_ACCUMULATOR_MIN_32 25 +#define STREAM_ACCUMULATOR_MIN_64 57 +#define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64)) + /*-****************************************** * bitStream encoding API (write forward) ********************************************/ /* bitStream can mix input from multiple sources. -* A critical property of these streams is that they encode and decode in **reverse** direction. 
-* So the first bit sequence you add will be the last to be read, like a LIFO stack. -*/ + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ typedef struct { size_t bitContainer; - int bitPos; + unsigned bitPos; char* startPtr; char* ptr; char* endPtr; @@ -109,6 +124,7 @@ unsigned bitsConsumed; const char* ptr; const char* start; + const char* limitPtr; } BIT_DStream_t; typedef enum { BIT_DStream_unfinished = 0, @@ -151,140 +167,178 @@ /*-************************************************************** * Internal functions ****************************************************************/ -MEM_STATIC unsigned BIT_highbit32 (register U32 val) +MEM_STATIC unsigned BIT_highbit32 (U32 val) { + assert(val != 0); + { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r=0; + _BitScanReverse ( &r, val ); + return (unsigned) r; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ - return 31 - __builtin_clz (val); + return 31 - __builtin_clz (val); # else /* Software version */ - static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; - U32 v = val; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; + static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31 }; + U32 v = val; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; # endif + } } /*===== Local Constants =====*/ -static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 
0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF }; /* up to 26 bits */ - +static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, + 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, + 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, + 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, + 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, + 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */ +#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0])) /*-************************************************************** * bitStream encoding ****************************************************************/ /*! BIT_initCStream() : - * `dstCapacity` must be > sizeof(void*) + * `dstCapacity` must be > sizeof(size_t) * @return : 0 if success, - otherwise an error code (can be tested using ERR_isError() ) */ -MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t dstCapacity) + * otherwise an error code (can be tested using ERR_isError()) */ +MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + void* startPtr, size_t dstCapacity) { bitC->bitContainer = 0; bitC->bitPos = 0; bitC->startPtr = (char*)startPtr; bitC->ptr = bitC->startPtr; - bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr); - if (dstCapacity <= sizeof(bitC->ptr)) return ERROR(dstSize_tooSmall); + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer); + if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall); return 0; } /*! BIT_addBits() : - can add up to 26 bits into `bitC`. - Does not check for register overflow ! */ -MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits) + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! 
*/ +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) { + MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; bitC->bitPos += nbBits; } /*! BIT_addBitsFast() : * works only if `value` is _clean_, meaning all high bits above nbBits are 0 */ -MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits) +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) { + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); bitC->bitContainer |= value << bitC->bitPos; bitC->bitPos += nbBits; } /*! BIT_flushBitsFast() : + * assumption : bitContainer has not overflowed * unsafe version; does not check buffer overflow */ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC) { size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); MEM_writeLEST(bitC->ptr, bitC->bitContainer); bitC->ptr += nbBytes; + assert(bitC->ptr <= bitC->endPtr); bitC->bitPos &= 7; - bitC->bitContainer >>= nbBytes*8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */ + bitC->bitContainer >>= nbBytes*8; } /*! BIT_flushBits() : + * assumption : bitContainer has not overflowed * safe version; check for buffer overflow, and prevents it. - * note : does not signal buffer overflow. This will be revealed later on using BIT_closeCStream() */ + * note : does not signal buffer overflow. 
+ * overflow will be revealed later on using BIT_closeCStream() */ MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC) { size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); MEM_writeLEST(bitC->ptr, bitC->bitContainer); bitC->ptr += nbBytes; if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; bitC->bitPos &= 7; - bitC->bitContainer >>= nbBytes*8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */ + bitC->bitContainer >>= nbBytes*8; } /*! BIT_closeCStream() : * @return : size of CStream, in bytes, - or 0 if it could not fit into dstBuffer */ + * or 0 if it could not fit into dstBuffer */ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) { BIT_addBitsFast(bitC, 1, 1); /* endMark */ BIT_flushBits(bitC); - - if (bitC->ptr >= bitC->endPtr) return 0; /* doesn't fit within authorized budget : cancel */ - + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); } /*-******************************************************** -* bitStream decoding +* bitStream decoding **********************************************************/ /*! BIT_initDStream() : -* Initialize a BIT_DStream_t. -* `bitD` : a pointer to an already allocated BIT_DStream_t structure. -* `srcSize` must be the *exact* size of the bitStream, in bytes. -* @return : size of stream (== srcSize) or an errorCode if a problem is detected -*/ + * Initialize a BIT_DStream_t. + * `bitD` : a pointer to an already allocated BIT_DStream_t structure. + * `srcSize` must be the *exact* size of the bitStream, in bytes. 
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected + */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) { if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } + bitD->start = (const char*)srcBuffer; + bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer); + if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */ - bitD->start = (const char*)srcBuffer; bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); bitD->bitContainer = MEM_readLEST(bitD->ptr); { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } } else { - bitD->start = (const char*)srcBuffer; bitD->ptr = bitD->start; bitD->bitContainer = *(const BYTE*)(bitD->start); switch(srcSize) { - case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); - case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); - case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); - case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; - case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; - case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; - default:; + case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + /* fall-through */ + + case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + /* fall-through */ + + case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + /* fall-through */ + + case 4: bitD->bitContainer += 
(size_t)(((const BYTE*)(srcBuffer))[3]) << 24; + /* fall-through */ + + case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; + /* fall-through */ + + case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; + /* fall-through */ + + default: break; } - { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; - bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; - if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; } @@ -306,12 +360,14 @@ # endif return _bextr_u32(bitContainer, start, nbBits); #else + assert(nbBits < BIT_MASK_SIZE); return (bitContainer >> start) & BIT_mask[nbBits]; #endif } MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) { + assert(nbBits < BIT_MASK_SIZE); return bitContainer & BIT_mask[nbBits]; } @@ -320,24 +376,24 @@ * local register is not modified. * On 32-bits, maxNbBits==24. * On 64-bits, maxNbBits==56. 
- * @return : value extracted - */ - MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + * @return : value extracted */ +MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) { #if defined(__BMI__) && defined(__GNUC__) /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */ return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits); #else - U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1; - return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask); + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask); #endif } /*! BIT_lookBitsFast() : -* unsafe version; only works only if nbBits >= 1 */ + * unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) { - U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1; - return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask); + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); } MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) @@ -348,8 +404,7 @@ /*! BIT_readBits() : * Read (consume) next n bits from local register and update. * Pay attention to not read more than nbBits contained into local register. - * @return : extracted value. - */ + * @return : extracted value. */ MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits) { size_t const value = BIT_lookBits(bitD, nbBits); @@ -358,25 +413,26 @@ } /*! 
BIT_readBitsFast() : -* unsafe version; only works only if nbBits >= 1 */ + * unsafe version; only works only if nbBits >= 1 */ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits) { size_t const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); BIT_skipBits(bitD, nbBits); return value; } /*! BIT_reloadDStream() : -* Refill `bitD` from buffer previously set in BIT_initDStream() . -* This function is safe, it guarantees it will not read beyond src buffer. -* @return : status of `BIT_DStream_t` internal register. - if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */ + * Refill `bitD` from buffer previously set in BIT_initDStream() . + * This function is safe, it guarantees it will not read beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { - if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* should not happen => corruption detected */ - return BIT_DStream_overflow; + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; - if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) { + if (bitD->ptr >= bitD->limitPtr) { bitD->ptr -= bitD->bitsConsumed >> 3; bitD->bitsConsumed &= 7; bitD->bitContainer = MEM_readLEST(bitD->ptr); @@ -386,6 +442,7 @@ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; return BIT_DStream_completed; } + /* start < ptr < limitPtr */ { U32 nbBytes = bitD->bitsConsumed >> 3; BIT_DStream_status result = BIT_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) { @@ -394,14 +451,14 @@ } bitD->ptr -= nbBytes; bitD->bitsConsumed -= nbBytes*8; - bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */ + 
bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */ return result; } } /*! BIT_endOfDStream() : -* @return Tells if DStream has exactly reached its end (all bits consumed). -*/ + * @return : 1 if DStream has _exactly_ reached its end (all bits consumed). + */ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) { return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/compiler.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/common/compiler.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPILER_H +#define ZSTD_COMPILER_H + +/*-******************************************************* +* Compiler specifics +*********************************************************/ +/* force inlining */ +#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline +#else +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +/** + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to elimininate the constant + * branches. 
+ */ +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR +/** + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers + * performance. + * + * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the + * always_inline attribute. + * + * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline + * attribute. + */ +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 +# define HINT_INLINE static INLINE_KEYWORD +#else +# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR +#endif + +/* force no inlining */ +#ifdef _MSC_VER +# define FORCE_NOINLINE static __declspec(noinline) +#else +# ifdef __GNUC__ +# define FORCE_NOINLINE static __attribute__((__noinline__)) +# else +# define FORCE_NOINLINE static +# endif +#endif + +/* target attribute */ +#ifndef __has_attribute + #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ +#endif +#if defined(__GNUC__) +# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) +#else +# define TARGET_ATTRIBUTE(target) +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. 
+ */ +#ifndef DYNAMIC_BMI2 + #if (defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \ + && (defined(__x86_64__) || defined(_M_X86)) \ + && !defined(__BMI2__) + # define DYNAMIC_BMI2 1 + #else + # define DYNAMIC_BMI2 0 + #endif +#endif + +/* prefetch */ +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0) +#elif defined(__GNUC__) +# define PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) +#else +# define PREFETCH(ptr) /* disabled */ +#endif + +/* disable warnings */ +#ifdef _MSC_VER /* Visual Studio */ +# include /* For Visual 2005 */ +# pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */ +# pragma warning(disable : 4324) /* disable: C4324: padded structure */ +#endif + +#endif /* ZSTD_COMPILER_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/cpu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/common/cpu.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+#include "mem.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;  /* leaf 1, ECX */
+    U32 f1d;  /* leaf 1, EDX */
+    U32 f7b;  /* leaf 7 (sub-leaf 0), EBX */
+    U32 f7c;  /* leaf 7 (sub-leaf 0), ECX */
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {  /* returns the CPUID feature words; a word stays 0 when its leaf is not queried */
+    U32 f1c = 0;
+    U32 f1d = 0;
+    U32 f7b = 0;
+    U32 f7c = 0;
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* __cpuid()/__cpuidex() are x86/x64-only intrinsics */
+    int reg[4];
+    __cpuid((int*)reg, 0);
+    {
+        int const n = reg[0];  /* highest supported standard leaf */
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block is like the normal cpuid branch below, but gcc
+     * reserves ebx for use as its PIC register, so we must explicitly
+     * save and restore it around cpuid to avoid clobbering the register.
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1)
+          :);
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\t"  /* ebx cannot be an output here, so hand f7b back through eax */
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)                                                        \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const
cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)  /* each C() line defines ZSTD_cpuid_<name>(), testing one bit of f1c */
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)  /* same as C(), but testing one bit of f1d */
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features.
*/ +#define B(name, bit) X(name, f7b, bit) + B(bmi1, 3) + B(hle, 4) + B(avx2, 5) + B(smep, 7) + B(bmi2, 8) + B(erms, 9) + B(invpcid, 10) + B(rtm, 11) + B(mpx, 14) + B(avx512f, 16) + B(avx512dq, 17) + B(rdseed, 18) + B(adx, 19) + B(smap, 20) + B(avx512ifma, 21) + B(pcommit, 22) + B(clflushopt, 23) + B(clwb, 24) + B(avx512pf, 26) + B(avx512er, 27) + B(avx512cd, 28) + B(sha, 29) + B(avx512bw, 30) + B(avx512vl, 31) +#undef B +#define C(name, bit) X(name, f7c, bit) + C(prefetchwt1, 0) + C(avx512vbmi, 1) +#undef C + +#undef X + +#endif /* ZSTD_COMMON_CPU_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/entropy_common.c --- a/contrib/python-zstandard/zstd/common/entropy_common.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/entropy_common.c Wed Apr 18 15:32:08 2018 -0400 @@ -43,27 +43,21 @@ #include "huf.h" -/*-**************************************** -* FSE Error Management -******************************************/ +/*=== Version ===*/ +unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; } + + +/*=== Error Management ===*/ unsigned FSE_isError(size_t code) { return ERR_isError(code); } - const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); } - -/* ************************************************************** -* HUF Error Management -****************************************************************/ unsigned HUF_isError(size_t code) { return ERR_isError(code); } - const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } /*-************************************************************** * FSE NCount encoding-decoding ****************************************************************/ -static short FSE_abs(short a) { return (short)(a<0 ? 
-a : a); } - size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, const void* headerBuffer, size_t hbSize) { @@ -117,21 +111,21 @@ } else { bitStream >>= 2; } } - { short const max = (short)((2*threshold-1)-remaining); - short count; + { int const max = (2*threshold-1) - remaining; + int count; if ((bitStream & (threshold-1)) < (U32)max) { - count = (short)(bitStream & (threshold-1)); - bitCount += nbBits-1; + count = bitStream & (threshold-1); + bitCount += nbBits-1; } else { - count = (short)(bitStream & (2*threshold-1)); + count = bitStream & (2*threshold-1); if (count >= threshold) count -= max; - bitCount += nbBits; + bitCount += nbBits; } count--; /* extra accuracy */ - remaining -= FSE_abs(count); - normalizedCounter[charnum++] = count; + remaining -= count < 0 ? -count : count; /* -1 means +1 */ + normalizedCounter[charnum++] = (short)count; previous0 = !count; while (remaining < threshold) { nbBits--; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/error_private.c --- a/contrib/python-zstandard/zstd/common/error_private.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/error_private.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ /* The purpose of this file is to have a single list of error strings embedded in binary */ @@ -20,23 +21,27 @@ case PREFIX(GENERIC): return "Error (generic)"; case PREFIX(prefix_unknown): return "Unknown frame descriptor"; case PREFIX(version_unsupported): return "Version not supported"; - case PREFIX(parameter_unknown): return "Unknown parameter type"; case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; - case PREFIX(frameParameter_unsupportedBy32bits): return "Frame parameter unsupported in 32-bits mode"; case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; - case PREFIX(compressionParameter_unsupported): return "Compression parameter is out of bound"; + case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; case PREFIX(init_missing): return "Context should be init first"; case PREFIX(memory_allocation): return "Allocation error : not enough memory"; + case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough"; case PREFIX(stage_wrong): return "Operation not authorized at current processing stage"; - case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; - case PREFIX(srcSize_wrong): return "Src size incorrect"; - case PREFIX(corruption_detected): return "Corrupted block detected"; - case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; case PREFIX(dictionary_wrong): 
return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; case PREFIX(maxCode): default: return notErrorCode; } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/error_private.h --- a/contrib/python-zstandard/zstd/common/error_private.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/error_private.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ /* Note : this module is expected to remain private, do not expose it */ @@ -48,10 +49,9 @@ /*-**************************************** * Error codes handling ******************************************/ -#ifdef ERROR -# undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */ -#endif -#define ERROR(name) ((size_t)-PREFIX(name)) +#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */ +#define ERROR(name) ZSTD_ERROR(name) +#define ZSTD_ERROR(name) ((size_t)-PREFIX(name)) ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/fse.h --- a/contrib/python-zstandard/zstd/common/fse.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/fse.h Wed Apr 18 15:32:08 2018 -0400 @@ -31,13 +31,14 @@ You can contact the author at : - Source repository : https://github.com/Cyan4973/FiniteStateEntropy ****************************************************************** */ -#ifndef FSE_H -#define FSE_H #if defined (__cplusplus) extern "C" { #endif +#ifndef FSE_H +#define FSE_H + /*-***************************************** * Dependencies @@ -45,6 +46,32 @@ #include /* size_t, ptrdiff_t */ +/*-***************************************** +* FSE_PUBLIC_API : control library symbols visibility +******************************************/ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define FSE_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define FSE_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define FSE_PUBLIC_API +#endif + +/*------ Version ------*/ +#define FSE_VERSION_MAJOR 0 +#define 
FSE_VERSION_MINOR 9 +#define FSE_VERSION_RELEASE 0 + +#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE +#define FSE_QUOTE(str) #str +#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str) +#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION) + +#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE) +FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + /*-**************************************** * FSE simple functions ******************************************/ @@ -56,8 +83,8 @@ if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. if FSE_isError(return), compression failed (more details using FSE_getErrorName()) */ -size_t FSE_compress(void* dst, size_t dstCapacity, - const void* src, size_t srcSize); +FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); /*! FSE_decompress(): Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', @@ -69,18 +96,18 @@ Why ? : making this distinction requires a header. Header management is intentionally delegated to the user layer, which can better manage special cases. 
*/ -size_t FSE_decompress(void* dst, size_t dstCapacity, - const void* cSrc, size_t cSrcSize); +FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, + const void* cSrc, size_t cSrcSize); /*-***************************************** * Tool functions ******************************************/ -size_t FSE_compressBound(size_t size); /* maximum compressed size */ +FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */ /* Error Management */ -unsigned FSE_isError(size_t code); /* tells if a return value is an error code */ -const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ +FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */ +FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ /*-***************************************** @@ -94,7 +121,7 @@ if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. if FSE_isError(return), it's an error code. */ -size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); /*-***************************************** @@ -127,50 +154,50 @@ @return : the count of the most frequent symbol (which is not identified). if return == srcSize, there is only one symbol. Can also return an error code, which can be tested with FSE_isError(). */ -size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); +FSE_PUBLIC_API size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); /*! FSE_optimalTableLog(): dynamically downsize 'tableLog' when conditions are met. 
It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. @return : recommended tableLog (necessarily <= 'maxTableLog') */ -unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); /*! FSE_normalizeCount(): normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). @return : tableLog, or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue); +FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue); /*! FSE_NCountWriteBound(): Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. Typically useful for allocation purpose. */ -size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); /*! FSE_writeNCount(): Compactly save 'normalizedCounter' into 'buffer'. @return : size of the compressed table, or an errorCode, which can be tested using FSE_isError(). */ -size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); /*! Constructor and Destructor of FSE_CTable. Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ typedef unsigned FSE_CTable; /* don't allocate that. 
It's only meant to be more restrictive than void* */ -FSE_CTable* FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue); -void FSE_freeCTable (FSE_CTable* ct); +FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); /*! FSE_buildCTable(): Builds `ct`, which must be already allocated, using FSE_createCTable(). @return : 0, or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); /*! FSE_compress_usingCTable(): Compress `src` using `ct` into `dst` which must be already allocated. @return : size of compressed data (<= `dstCapacity`), or 0 if compressed data could not fit into `dst`, or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct); +FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct); /*! Tutorial : @@ -223,25 +250,25 @@ @return : size read from 'rBuffer', or an errorCode, which can be tested using FSE_isError(). maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ -size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize); +FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize); /*! Constructor and Destructor of FSE_DTable. Note that its size depends on 'tableLog' */ typedef unsigned FSE_DTable; /* don't allocate that. 
It's just a way to be more restrictive than void* */ -FSE_DTable* FSE_createDTable(unsigned tableLog); -void FSE_freeDTable(FSE_DTable* dt); +FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); /*! FSE_buildDTable(): Builds 'dt', which must be already allocated, using FSE_createDTable(). return : 0, or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); /*! FSE_decompress_usingDTable(): Decompress compressed source `cSrc` of size `cSrcSize` using `dt` into `dst` which must be already allocated. @return : size of regenerated data (necessarily <= `dstCapacity`), or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); +FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); /*! Tutorial : @@ -271,8 +298,10 @@ If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small) */ +#endif /* FSE_H */ -#ifdef FSE_STATIC_LINKING_ONLY +#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) +#define FSE_H_FSE_STATIC_LINKING_ONLY /* *** Dependency *** */ #include "bitstream.h" @@ -290,6 +319,10 @@ #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2)) #define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1< *maxSymbolValuePtr` (presuming it's also the size of `count`). */ @@ -327,7 +360,7 @@ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). 
* FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. */ -#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + (1<<((maxTableLog>2)?(maxTableLog-2):0)) ) +#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); @@ -351,6 +384,11 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog); /**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */ +typedef enum { + FSE_repeat_none, /**< Cannot use the previous table */ + FSE_repeat_check, /**< Can use the previous table but it must be checked */ + FSE_repeat_valid /**< Can use the previous table and it is asumed to be valid */ + } FSE_repeat; /* ***************************************** * FSE symbol compression API @@ -524,9 +562,9 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol) { - const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; const U16* const stateTable = (const U16*)(statePtr->stateTable); - U32 nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); BIT_addBits(bitC, statePtr->value, nbBitsOut); statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; } @@ -664,5 +702,3 @@ #if defined 
(__cplusplus) } #endif - -#endif /* FSE_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/fse_decompress.c --- a/contrib/python-zstandard/zstd/common/fse_decompress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/fse_decompress.c Wed Apr 18 15:32:08 2018 -0400 @@ -34,35 +34,15 @@ /* ************************************************************** -* Compiler specifics -****************************************************************/ -#ifdef _MSC_VER /* Visual Studio */ -# define FORCE_INLINE static __forceinline -# include /* For Visual 2005 */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */ -#else -# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -# else -# define FORCE_INLINE static -# endif /* __STDC_VERSION__ */ -#endif - - -/* ************************************************************** * Includes ****************************************************************/ #include /* malloc, free, qsort */ #include /* memcpy, memset */ -#include /* printf (debug) */ #include "bitstream.h" +#include "compiler.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" +#include "error_private.h" /* ************************************************************** @@ -159,8 +139,8 @@ { U32 u; for (u=0; u /* size_t */ -/* *** simple functions *** */ -/** -HUF_compress() : - Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. - 'dst' buffer must be already allocated. - Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). - `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. - @return : size of compressed data (<= `dstCapacity`). 
- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! - if return == 1, srcData is a single repeated byte symbol (RLE compression). - if HUF_isError(return), compression failed (more details using HUF_getErrorName()) -*/ -size_t HUF_compress(void* dst, size_t dstCapacity, - const void* src, size_t srcSize); +/* *** library symbols visibility *** */ +/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, + * HUF symbols remain "private" (internal symbols for library only). + * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define HUF_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +#else +# define HUF_PUBLIC_API +#endif + + +/* ========================== */ +/* *** simple functions *** */ +/* ========================== */ -/** -HUF_decompress() : - Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', - into already allocated buffer 'dst', of minimum size 'dstSize'. - `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. - Note : in contrast with FSE, HUF_decompress can regenerate - RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, - because it knows size to regenerate. - @return : size of regenerated data (== originalSize), - or an error code, which can be tested using HUF_isError() -*/ -size_t HUF_decompress(void* dst, size_t originalSize, - const void* cSrc, size_t cSrcSize); +/** HUF_compress() : + * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. 
+ * 'dst' buffer must be already allocated. + * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). + * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. + * @return : size of compressed data (<= `dstCapacity`). + * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) + */ +HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/** HUF_decompress() : + * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', + * into already allocated buffer 'dst', of minimum size 'dstSize'. + * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. + * Note : in contrast with FSE, HUF_decompress can regenerate + * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, + * because it knows size to regenerate (originalSize). + * @return : size of regenerated data (== originalSize), + * or an error code, which can be tested using HUF_isError() + */ +HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, + const void* cSrc, size_t cSrcSize); /* *** Tool functions *** */ -#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ -size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ +#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ +HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ /* Error Management */ -unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ -const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */ +HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ +HUF_PUBLIC_API const char* HUF_getErrorName(size_t 
code); /**< provides error code string (useful for debugging) */ /* *** Advanced function *** */ /** HUF_compress2() : - * Same as HUF_compress(), but offers direct control over `maxSymbolValue` and `tableLog` . - * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ -size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); + * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. + * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . + * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog); /** HUF_compress4X_wksp() : -* Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */ -size_t HUF_compress4X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least 1024 unsigned */ + * Same as HUF_compress2(), but uses externally allocated `workSpace`. + * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE (6 << 10) +#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) +HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize); - +#endif /* HUF_H_298734234 */ -#ifdef HUF_STATIC_LINKING_ONLY +/* ****************************************************************** + * WARNING !! + * The following section contains advanced and experimental definitions + * which shall never be used in the context of a dynamic library, + * because they are not guaranteed to remain stable in the future. + * Only consider them in association with static linking. 
+ * *****************************************************************/ +#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +#define HUF_H_HUF_STATIC_LINKING_ONLY /* *** Dependencies *** */ #include "mem.h" /* U32 */ /* *** Constants *** */ -#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ -#define HUF_TABLELOG_MAX 12 /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ -#define HUF_TABLELOG_DEFAULT 11 /* tableLog by default, when not specified */ -#define HUF_SYMBOLVALUE_MAX 255 +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ +#define HUF_SYMBOLVALUE_MAX 255 + +#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) # error "HUF_TABLELOG_MAX is too large !" 
#endif @@ -116,12 +149,14 @@ ******************************************/ /* HUF buffer bounds */ #define HUF_CTABLEBOUND 129 -#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true if incompressible pre-filtered with fast heuristic */ +#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true when incompressible is pre-filtered with fast heuristic */ #define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ /* static allocation of HUF's Compression Table */ +#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32)) #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ - U32 name##hb[maxSymbolValue+1]; \ + U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \ void* name##hv = &(name##hb); \ HUF_CElt* name = (HUF_CElt*)(name##hv) /* no final ; */ @@ -142,97 +177,151 @@ size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */ size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */ size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t 
HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ /* **************************************** -* HUF detailed API -******************************************/ -/*! -HUF_compress() does the following: -1. count symbol occurrence from source[] into table count[] using FSE_count() -2. (optional) refine tableLog using HUF_optimalTableLog() -3. build Huffman table from count using HUF_buildCTable() -4. save Huffman table to memory buffer using HUF_writeCTable() -5. encode the data stream using HUF_compress4X_usingCTable() + * HUF detailed API + * ****************************************/ -The following API allows targeting specific sub-functions for advanced tasks. -For example, it's possible to compress several blocks using the same 'CTable', -or to save and regenerate 'CTable' using external methods. -*/ -/* FSE_count() : find it within "fse.h" */ +/*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") + * 2. (optional) refine tableLog using HUF_optimalTableLog() + * 3. build Huffman table from count using HUF_buildCTable() + * 4. save Huffman table to memory buffer using HUF_writeCTable() + * 5. encode the data stream using HUF_compress4X_usingCTable() + * + * The following API allows targeting specific sub-functions for advanced tasks. + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. 
+ */ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ -size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +typedef enum { + HUF_repeat_none, /**< Cannot use the previous table */ + HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */ + } HUF_repeat; +/** HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. */ +size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. - * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned. 
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. */ +#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize); /*! HUF_readStats() : - Read compact Huffman tree, saved by HUF_writeCTable(). - `huffWeight` is destination buffer. - @return : size read from `src` , or an error Code . - Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ -size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, - U32* nbSymbolsPtr, U32* tableLogPtr, + * Read compact Huffman tree, saved by HUF_writeCTable(). + * `huffWeight` is destination buffer. + * @return : size read from `src` , or an error Code . + * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ +size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize); /** HUF_readCTable() : -* Loading a CTable saved with HUF_writeCTable() */ -size_t HUF_readCTable (HUF_CElt* CTable, unsigned maxSymbolValue, const void* src, size_t srcSize); + * Loading a CTable saved with HUF_writeCTable() */ +size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); /* -HUF_decompress() does the following: -1. select the decompression algorithm (X2, X4) based on pre-computed heuristics -2. build Huffman table from save, using HUF_readDTableXn() -3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable -*/ + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics + * 2. build Huffman table from save, using HUF_readDTableX?() + * 3. 
decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable() + */ /** HUF_selectDecoder() : -* Tells which decoder is likely to decode faster, -* based on a set of pre-determined metrics. -* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . -* Assumption : 0 < cSrcSize < dstSize <= 128 KB */ + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . + * Assumption : 0 < dstSize <= 128 KB */ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); +/** + * The minimum workspace size for the `workSpace` used in + * HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp(). + * + * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when + * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15. + * Buffer overflow errors may potentially occur if code modifications result in + * a required workspace size greater than that specified in the following + * macro. + */ +#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX4_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +/* ====================== */ /* single stream variants */ +/* 
====================== */ size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least 1024 unsigned */ +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +/** HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. 
*/ +size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */ size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +/* BMI2 variants. 
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. + */ +size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); +size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); + #endif /* HUF_STATIC_LINKING_ONLY */ - #if defined (__cplusplus) } #endif - -#endif /* HUF_H_298734234 */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/mem.h --- a/contrib/python-zstandard/zstd/common/mem.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/mem.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ #ifndef MEM_H_MODULE @@ -48,14 +49,13 @@ *****************************************************************/ #if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef int16_t S16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; - typedef int64_t S64; - typedef intptr_t iPtrDiff; + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef int16_t S16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef int64_t S64; #else typedef unsigned char BYTE; typedef unsigned short U16; @@ -64,7 +64,6 @@ typedef signed int S32; typedef unsigned long long U64; typedef signed long long S64; - typedef ptrdiff_t iPtrDiff; #endif @@ -76,19 +75,18 @@ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. * The below switch allow to select different access method for improved performance. * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable). * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. * Method 2 : direct access. This method is portable but violate C standard. * It can generate buggy code on targets depending on alignment. - * In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6) * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
* Prefer these methods in priority order (0 > 1 > 2) */ #ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ # if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) # define MEM_FORCE_MEMORY_ACCESS 2 -# elif defined(__INTEL_COMPILER) /*|| defined(_MSC_VER)*/ || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# elif defined(__INTEL_COMPILER) || defined(__GNUC__) # define MEM_FORCE_MEMORY_ACCESS 1 # endif #endif @@ -109,7 +107,7 @@ MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } -MEM_STATIC U64 MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; } +MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; } MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } @@ -120,21 +118,27 @@ /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ /* currently only defined for gcc and icc */ #if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) - __pragma( pack(push, 1) ) - typedef union { U16 u16; U32 u32; U64 u64; size_t st; } unalign; + __pragma( pack(push, 1) ) + typedef struct { U16 v; } unalign16; + typedef struct { U32 v; } unalign32; + typedef struct { U64 v; } unalign64; + typedef struct { size_t v; } unalignArch; __pragma( pack(pop) ) #else - typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign; + typedef struct { U16 v; } 
__attribute__((packed)) unalign16; + typedef struct { U32 v; } __attribute__((packed)) unalign32; + typedef struct { U64 v; } __attribute__((packed)) unalign64; + typedef struct { size_t v; } __attribute__((packed)) unalignArch; #endif -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } -MEM_STATIC U64 MEM_readST(const void* ptr) { return ((const unalign*)ptr)->st; } +MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; } +MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; } +MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; } +MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; } -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } -MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } -MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; } +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; } #else @@ -182,7 +186,7 @@ { #if defined(_MSC_VER) /* Visual Studio */ return _byteswap_ulong(in); -#elif defined (__GNUC__) +#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403) return __builtin_bswap32(in); #else return ((in << 24) & 0xff000000 ) | @@ -196,7 +200,7 @@ { #if defined(_MSC_VER) /* Visual Studio */ return _byteswap_uint64(in); -#elif defined (__GNUC__) +#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403) return __builtin_bswap64(in); #else return ((in << 56) & 0xff00000000000000ULL) | @@ -351,20 
+355,6 @@ } -/* function safe only for comparisons */ -MEM_STATIC U32 MEM_readMINMATCH(const void* memPtr, U32 length) -{ - switch (length) - { - default : - case 4 : return MEM_read32(memPtr); - case 3 : if (MEM_isLittleEndian()) - return MEM_read32(memPtr)<<8; - else - return MEM_read32(memPtr)>>8; - } -} - #if defined (__cplusplus) } #endif diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/pool.c --- a/contrib/python-zstandard/zstd/common/pool.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/pool.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,17 +1,18 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ /* ====== Dependencies ======= */ #include /* size_t */ -#include /* malloc, calloc, free */ #include "pool.h" +#include "zstd_internal.h" /* ZSTD_malloc, ZSTD_free */ /* ====== Compiler specifics ====== */ #if defined(_MSC_VER) @@ -25,13 +26,14 @@ /* A job is a function and an opaque argument */ typedef struct POOL_job_s { - POOL_function function; - void *opaque; + POOL_function function; + void *opaque; } POOL_job; struct POOL_ctx_s { + ZSTD_customMem customMem; /* Keep track of the threads */ - pthread_t *threads; + ZSTD_pthread_t *threads; size_t numThreads; /* The queue is a circular buffer */ @@ -39,12 +41,18 @@ size_t queueHead; size_t queueTail; size_t queueSize; + + /* The number of threads working on jobs */ + size_t numThreadsBusy; + /* Indicates if the queue is empty */ + int queueEmpty; + /* The mutex protects the queue */ - pthread_mutex_t queueMutex; + ZSTD_pthread_mutex_t queueMutex; /* Condition variable for pushers to wait on when the queue is full */ - pthread_cond_t queuePushCond; + ZSTD_pthread_cond_t queuePushCond; /* Condition variables for poppers to wait on when the queue is empty */ - pthread_cond_t queuePopCond; + ZSTD_pthread_cond_t queuePopCond; /* Indicates if the queue is shutting down */ int shutdown; }; @@ -59,55 +67,73 @@ if (!ctx) { return NULL; } for (;;) { /* Lock the mutex and wait for a non-empty queue or until shutdown */ - pthread_mutex_lock(&ctx->queueMutex); - while (ctx->queueHead == ctx->queueTail && !ctx->shutdown) { - pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + + while (ctx->queueEmpty && !ctx->shutdown) { + ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); } /* empty => shutting down: so stop */ - if (ctx->queueHead == ctx->queueTail) { - pthread_mutex_unlock(&ctx->queueMutex); + if (ctx->queueEmpty) { + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); return opaque; } /* Pop a job off the queue */ { POOL_job const job = 
ctx->queue[ctx->queueHead]; ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; + ctx->numThreadsBusy++; + ctx->queueEmpty = ctx->queueHead == ctx->queueTail; /* Unlock the mutex, signal a pusher, and run the job */ - pthread_mutex_unlock(&ctx->queueMutex); - pthread_cond_signal(&ctx->queuePushCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + job.function(job.opaque); - } - } + + /* If the intended queue size was 0, signal after finishing job */ + if (ctx->queueSize == 1) { + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->numThreadsBusy--; + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + } } + } /* for (;;) */ /* Unreachable */ } -POOL_ctx *POOL_create(size_t numThreads, size_t queueSize) { - POOL_ctx *ctx; +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) { + POOL_ctx* ctx; /* Check the parameters */ - if (!numThreads || !queueSize) { return NULL; } + if (!numThreads) { return NULL; } /* Allocate the context and zero initialize */ - ctx = (POOL_ctx *)calloc(1, sizeof(POOL_ctx)); + ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem); if (!ctx) { return NULL; } /* Initialize the job queue. * It needs one extra space since one space is wasted to differentiate empty * and full queues. 
*/ ctx->queueSize = queueSize + 1; - ctx->queue = (POOL_job *)malloc(ctx->queueSize * sizeof(POOL_job)); + ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem); ctx->queueHead = 0; ctx->queueTail = 0; - pthread_mutex_init(&ctx->queueMutex, NULL); - pthread_cond_init(&ctx->queuePushCond, NULL); - pthread_cond_init(&ctx->queuePopCond, NULL); + ctx->numThreadsBusy = 0; + ctx->queueEmpty = 1; + (void)ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL); + (void)ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL); + (void)ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL); ctx->shutdown = 0; /* Allocate space for the thread handles */ - ctx->threads = (pthread_t *)malloc(numThreads * sizeof(pthread_t)); + ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem); ctx->numThreads = 0; + ctx->customMem = customMem; /* Check for errors */ if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; } /* Initialize the threads */ { size_t i; for (i = 0; i < numThreads; ++i) { - if (pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { + if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { ctx->numThreads = i; POOL_free(ctx); return NULL; @@ -120,75 +146,138 @@ /*! POOL_join() : Shutdown the queue, wake any sleeping threads, and join all of the threads. 
*/ -static void POOL_join(POOL_ctx *ctx) { +static void POOL_join(POOL_ctx* ctx) { /* Shut down the queue */ - pthread_mutex_lock(&ctx->queueMutex); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); ctx->shutdown = 1; - pthread_mutex_unlock(&ctx->queueMutex); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); /* Wake up sleeping threads */ - pthread_cond_broadcast(&ctx->queuePushCond); - pthread_cond_broadcast(&ctx->queuePopCond); + ZSTD_pthread_cond_broadcast(&ctx->queuePushCond); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); /* Join all of the threads */ { size_t i; for (i = 0; i < ctx->numThreads; ++i) { - pthread_join(ctx->threads[i], NULL); + ZSTD_pthread_join(ctx->threads[i], NULL); } } } void POOL_free(POOL_ctx *ctx) { if (!ctx) { return; } POOL_join(ctx); - pthread_mutex_destroy(&ctx->queueMutex); - pthread_cond_destroy(&ctx->queuePushCond); - pthread_cond_destroy(&ctx->queuePopCond); - if (ctx->queue) free(ctx->queue); - if (ctx->threads) free(ctx->threads); - free(ctx); + ZSTD_pthread_mutex_destroy(&ctx->queueMutex); + ZSTD_pthread_cond_destroy(&ctx->queuePushCond); + ZSTD_pthread_cond_destroy(&ctx->queuePopCond); + ZSTD_free(ctx->queue, ctx->customMem); + ZSTD_free(ctx->threads, ctx->customMem); + ZSTD_free(ctx, ctx->customMem); +} + +size_t POOL_sizeof(POOL_ctx *ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + return sizeof(*ctx) + + ctx->queueSize * sizeof(POOL_job) + + ctx->numThreads * sizeof(ZSTD_pthread_t); +} + +/** + * Returns 1 if the queue is full and 0 otherwise. + * + * If the queueSize is 1 (the pool was created with an intended queueSize of 0), + * then a queue is empty if there is a thread free and no job is waiting. 
+ */ +static int isQueueFull(POOL_ctx const* ctx) { + if (ctx->queueSize > 1) { + return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize); + } else { + return ctx->numThreadsBusy == ctx->numThreads || + !ctx->queueEmpty; + } +} + + +static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque) +{ + POOL_job const job = {function, opaque}; + assert(ctx != NULL); + if (ctx->shutdown) return; + + ctx->queueEmpty = 0; + ctx->queue[ctx->queueTail] = job; + ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize; + ZSTD_pthread_cond_signal(&ctx->queuePopCond); +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + /* Wait until there is space in the queue for the new job */ + while (isQueueFull(ctx) && (!ctx->shutdown)) { + ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); } -void POOL_add(void *ctxVoid, POOL_function function, void *opaque) { - POOL_ctx *ctx = (POOL_ctx *)ctxVoid; - if (!ctx) { return; } - pthread_mutex_lock(&ctx->queueMutex); - { POOL_job const job = {function, opaque}; - /* Wait until there is space in the queue for the new job */ - size_t newTail = (ctx->queueTail + 1) % ctx->queueSize; - while (ctx->queueHead == newTail && !ctx->shutdown) { - pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); - newTail = (ctx->queueTail + 1) % ctx->queueSize; - } - /* The queue is still going => there is space */ - if (!ctx->shutdown) { - ctx->queue[ctx->queueTail] = job; - ctx->queueTail = newTail; - } +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + if (isQueueFull(ctx)) { + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 0; } - pthread_mutex_unlock(&ctx->queueMutex); - pthread_cond_signal(&ctx->queuePopCond); + POOL_add_internal(ctx, 
function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 1; } + #else /* ZSTD_MULTITHREAD not defined */ + +/* ========================== */ /* No multi-threading support */ +/* ========================== */ -/* We don't need any data, but if it is empty malloc() might return NULL. */ + +/* We don't need any data, but if it is empty, malloc() might return NULL. */ struct POOL_ctx_s { - int data; + int dummy; }; +static POOL_ctx g_ctx; -POOL_ctx *POOL_create(size_t numThreads, size_t queueSize) { - (void)numThreads; - (void)queueSize; - return (POOL_ctx *)malloc(sizeof(POOL_ctx)); +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); } -void POOL_free(POOL_ctx *ctx) { - if (ctx) free(ctx); +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) { + (void)numThreads; + (void)queueSize; + (void)customMem; + return &g_ctx; +} + +void POOL_free(POOL_ctx* ctx) { + assert(!ctx || ctx == &g_ctx); + (void)ctx; } -void POOL_add(void *ctx, POOL_function function, void *opaque) { - (void)ctx; - function(opaque); +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); +} + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); + return 1; +} + +size_t POOL_sizeof(POOL_ctx* ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + assert(ctx == &g_ctx); + return sizeof(*ctx); } #endif /* ZSTD_MULTITHREAD */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/pool.h --- a/contrib/python-zstandard/zstd/common/pool.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/pool.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,11 +1,13 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. 
* - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ + #ifndef POOL_H #define POOL_H @@ -15,38 +17,54 @@ #include /* size_t */ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */ +#include "zstd.h" typedef struct POOL_ctx_s POOL_ctx; /*! POOL_create() : - Create a thread pool with at most `numThreads` threads. - `numThreads` must be at least 1. - The maximum number of queued jobs before blocking is `queueSize`. - `queueSize` must be at least 1. - @return : The POOL_ctx pointer on success else NULL. + * Create a thread pool with at most `numThreads` threads. + * `numThreads` must be at least 1. + * The maximum number of queued jobs before blocking is `queueSize`. + * @return : POOL_ctx pointer on success, else NULL. */ -POOL_ctx *POOL_create(size_t numThreads, size_t queueSize); +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize); + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem); /*! POOL_free() : Free a thread pool returned by POOL_create(). */ -void POOL_free(POOL_ctx *ctx); +void POOL_free(POOL_ctx* ctx); + +/*! POOL_sizeof() : + return memory usage of pool returned by POOL_create(). +*/ +size_t POOL_sizeof(POOL_ctx* ctx); /*! POOL_function : The function type that can be added to a thread pool. */ -typedef void (*POOL_function)(void *); +typedef void (*POOL_function)(void*); /*! POOL_add_function : The function type for a generic thread pool add function. 
*/ -typedef void (*POOL_add_function)(void *, POOL_function, void *); +typedef void (*POOL_add_function)(void*, POOL_function, void*); /*! POOL_add() : - Add the job `function(opaque)` to the thread pool. + Add the job `function(opaque)` to the thread pool. `ctx` must be valid. Possibly blocks until there is room in the queue. Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed. */ -void POOL_add(void *ctx, POOL_function function, void *opaque); +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque); + + +/*! POOL_tryAdd() : + Add the job `function(opaque)` to the thread pool if a worker is available. + return immediately otherwise. + @return : 1 if successful, 0 if not. +*/ +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque); #if defined (__cplusplus) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/threading.c --- a/contrib/python-zstandard/zstd/common/threading.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/threading.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,11 +1,10 @@ - /** * Copyright (c) 2016 Tino Reichardt * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
* * You can contact the author at: * - zstdmt source repository: https://github.com/mcmilk/zstdmt @@ -15,11 +14,8 @@ * This file will hold wrapper for systems, which do not support pthreads */ -/* ====== Compiler specifics ====== */ -#if defined(_MSC_VER) -# pragma warning(disable : 4206) /* disable: C4206: translation unit is empty (when ZSTD_MULTITHREAD is not defined) */ -#endif - +/* create fake symbol to avoid empty trnaslation unit warning */ +int g_ZSTD_threading_useles_symbol; #if defined(ZSTD_MULTITHREAD) && defined(_WIN32) @@ -39,12 +35,12 @@ static unsigned __stdcall worker(void *arg) { - pthread_t* const thread = (pthread_t*) arg; + ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg; thread->arg = thread->start_routine(thread->arg); return 0; } -int pthread_create(pthread_t* thread, const void* unused, +int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, void* (*start_routine) (void*), void* arg) { (void)unused; @@ -58,16 +54,16 @@ return 0; } -int _pthread_join(pthread_t * thread, void **value_ptr) +int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr) { DWORD result; - if (!thread->handle) return 0; + if (!thread.handle) return 0; - result = WaitForSingleObject(thread->handle, INFINITE); + result = WaitForSingleObject(thread.handle, INFINITE); switch (result) { case WAIT_OBJECT_0: - if (value_ptr) *value_ptr = thread->arg; + if (value_ptr) *value_ptr = thread.arg; return 0; case WAIT_ABANDONED: return EINVAL; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/threading.h --- a/contrib/python-zstandard/zstd/common/threading.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/threading.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,11 +1,10 @@ - /** * Copyright (c) 2016 Tino Reichardt * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). * * You can contact the author at: * - zstdmt source repository: https://github.com/mcmilk/zstdmt @@ -38,62 +37,82 @@ # define WIN32_LEAN_AND_MEAN #endif +#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */ #include +#undef ERROR +#define ERROR(name) ZSTD_ERROR(name) + /* mutex */ -#define pthread_mutex_t CRITICAL_SECTION -#define pthread_mutex_init(a,b) InitializeCriticalSection((a)) -#define pthread_mutex_destroy(a) DeleteCriticalSection((a)) -#define pthread_mutex_lock(a) EnterCriticalSection((a)) -#define pthread_mutex_unlock(a) LeaveCriticalSection((a)) +#define ZSTD_pthread_mutex_t CRITICAL_SECTION +#define ZSTD_pthread_mutex_init(a, b) ((void)(b), InitializeCriticalSection((a)), 0) +#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a)) +#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a)) +#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a)) /* condition variable */ -#define pthread_cond_t CONDITION_VARIABLE -#define pthread_cond_init(a, b) InitializeConditionVariable((a)) -#define pthread_cond_destroy(a) /* No delete */ -#define pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) -#define pthread_cond_signal(a) WakeConditionVariable((a)) -#define pthread_cond_broadcast(a) WakeAllConditionVariable((a)) +#define ZSTD_pthread_cond_t CONDITION_VARIABLE +#define ZSTD_pthread_cond_init(a, b) ((void)(b), InitializeConditionVariable((a)), 0) +#define ZSTD_pthread_cond_destroy(a) ((void)(a)) +#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) +#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a)) +#define ZSTD_pthread_cond_broadcast(a) 
WakeAllConditionVariable((a)) -/* pthread_create() and pthread_join() */ +/* ZSTD_pthread_create() and ZSTD_pthread_join() */ typedef struct { HANDLE handle; void* (*start_routine)(void*); void* arg; -} pthread_t; +} ZSTD_pthread_t; -int pthread_create(pthread_t* thread, const void* unused, +int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, void* (*start_routine) (void*), void* arg); -#define pthread_join(a, b) _pthread_join(&(a), (b)) -int _pthread_join(pthread_t* thread, void** value_ptr); +int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr); /** * add here more wrappers as required */ -#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection mathod */ +#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */ /* === POSIX Systems === */ # include +#define ZSTD_pthread_mutex_t pthread_mutex_t +#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b)) +#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) +#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a)) +#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) + +#define ZSTD_pthread_cond_t pthread_cond_t +#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b)) +#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a)) +#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) +#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a)) +#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) + +#define ZSTD_pthread_t pthread_t +#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) + #else /* ZSTD_MULTITHREAD not defined */ /* No multithreading support */ -#define pthread_mutex_t int /* #define rather than typedef, as sometimes pthread support is implicit, resulting in duplicated symbols */ -#define pthread_mutex_init(a,b) -#define pthread_mutex_destroy(a) -#define pthread_mutex_lock(a) -#define 
pthread_mutex_unlock(a) +typedef int ZSTD_pthread_mutex_t; +#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_mutex_destroy(a) ((void)(a)) +#define ZSTD_pthread_mutex_lock(a) ((void)(a)) +#define ZSTD_pthread_mutex_unlock(a) ((void)(a)) -#define pthread_cond_t int -#define pthread_cond_init(a,b) -#define pthread_cond_destroy(a) -#define pthread_cond_wait(a,b) -#define pthread_cond_signal(a) -#define pthread_cond_broadcast(a) +typedef int ZSTD_pthread_cond_t; +#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_cond_destroy(a) ((void)(a)) +#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b)) +#define ZSTD_pthread_cond_signal(a) ((void)(a)) +#define ZSTD_pthread_cond_broadcast(a) ((void)(a)) -/* do not use pthread_t */ +/* do not use ZSTD_pthread_t */ #endif /* ZSTD_MULTITHREAD */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/xxhash.c --- a/contrib/python-zstandard/zstd/common/xxhash.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/xxhash.c Wed Apr 18 15:32:08 2018 -0400 @@ -104,26 +104,34 @@ #include static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } -#define XXH_STATIC_LINKING_ONLY +#ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +#endif #include "xxhash.h" /* ************************************* * Compiler Specific Options ***************************************/ -#ifdef _MSC_VER /* Visual Studio */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# define FORCE_INLINE static __forceinline +#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline #else -# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# 
else -# define FORCE_INLINE static inline -# endif -# else -# define FORCE_INLINE static -# endif /* __STDC_VERSION__ */ +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR + + +#ifdef _MSC_VER +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #endif @@ -246,7 +254,7 @@ *****************************/ typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; -FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); @@ -254,7 +262,7 @@ return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); } -FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } @@ -264,7 +272,7 @@ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); } -FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); @@ -272,7 +280,7 @@ return endian==XXH_littleEndian ? 
*(const U64*)ptr : XXH_swap64(*(const U64*)ptr); } -FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) { return XXH_readLE64_align(ptr, endian, XXH_unaligned); } @@ -333,7 +341,7 @@ return seed; } -FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) +FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; @@ -433,7 +441,7 @@ return acc; } -FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) +FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* const bEnd = p + len; @@ -582,7 +590,7 @@ } -FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) { const BYTE* p = (const BYTE*)input; const BYTE* const bEnd = p + len; @@ -652,7 +660,7 @@ -FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) { const BYTE * p = (const BYTE*)state->mem32; const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize; @@ -702,7 +710,7 @@ /* **** XXH64 **** */ -FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) { const BYTE* p = (const BYTE*)input; const BYTE* const 
bEnd = p + len; @@ -769,7 +777,7 @@ -FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) { const BYTE * p = (const BYTE*)state->mem64; const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize; diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/xxhash.h --- a/contrib/python-zstandard/zstd/common/xxhash.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/xxhash.h Wed Apr 18 15:32:08 2018 -0400 @@ -64,16 +64,12 @@ XXH32 6.8 GB/s 6.0 GB/s */ -#ifndef XXHASH_H_5627135585666179 -#define XXHASH_H_5627135585666179 1 - #if defined (__cplusplus) extern "C" { #endif -#ifndef XXH_NAMESPACE -# define XXH_NAMESPACE ZSTD_ /* Zstandard specific */ -#endif +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 /* **************************** @@ -242,6 +238,11 @@ /* ************************** * Canonical representation ****************************/ +/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. +* The canonical representation uses human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. +*/ typedef struct { unsigned char digest[4]; } XXH32_canonical_t; typedef struct { unsigned char digest[8]; } XXH64_canonical_t; @@ -251,14 +252,9 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); -/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. -* The canonical representation uses human-readable write convention, aka big-endian (large digits first). 
-* These functions allow transformation of hash result into and from its canonical format. -* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. -*/ +#endif /* XXHASH_H_5627135585666179 */ -#ifdef XXH_STATIC_LINKING_ONLY /* ================================================================================================ This section contains definitions which are not guaranteed to remain stable. @@ -266,6 +262,8 @@ They shall only be used with static linking. Never use these definitions in association with dynamic linking ! =================================================================================================== */ +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345) +#define XXH_STATIC_H_3543687687345 /* These definitions are only meant to allow allocation of XXH state statically, on stack, or in a struct for example. @@ -299,11 +297,9 @@ # include "xxhash.c" /* include xxhash functions as `static`, for inlining */ # endif -#endif /* XXH_STATIC_LINKING_ONLY */ +#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */ #if defined (__cplusplus) } #endif - -#endif /* XXHASH_H_5627135585666179 */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/zstd_common.c --- a/contrib/python-zstandard/zstd/common/zstd_common.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/zstd_common.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. 
+ * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ @@ -12,62 +13,74 @@ /*-************************************* * Dependencies ***************************************/ -#include /* malloc */ +#include /* malloc, calloc, free */ +#include /* memset */ #include "error_private.h" -#define ZSTD_STATIC_LINKING_ONLY -#include "zstd.h" /* declaration of ZSTD_isError, ZSTD_getErrorName, ZSTD_getErrorCode, ZSTD_getErrorString, ZSTD_versionNumber */ +#include "zstd_internal.h" /*-**************************************** * Version ******************************************/ -unsigned ZSTD_versionNumber (void) { return ZSTD_VERSION_NUMBER; } +unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; } + +const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; } /*-**************************************** * ZSTD Error Management ******************************************/ /*! ZSTD_isError() : -* tells if a return value is an error code */ + * tells if a return value is an error code */ unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } /*! ZSTD_getErrorName() : -* provides error code string from function result (useful for debugging) */ + * provides error code string from function result (useful for debugging) */ const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); } /*! ZSTD_getError() : -* convert a `size_t` function result into a proper ZSTD_errorCode enum */ + * convert a `size_t` function result into a proper ZSTD_errorCode enum */ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } /*! 
ZSTD_getErrorString() : -* provides error code string from enum */ -const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorName(code); } + * provides error code string from enum */ +const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } + +/*! g_debuglog_enable : + * turn on/off debug traces (global switch) */ +#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 2) +int g_debuglog_enable = 1; +#endif /*=************************************************************** * Custom allocator ****************************************************************/ -/* default uses stdlib */ -void* ZSTD_defaultAllocFunction(void* opaque, size_t size) +void* ZSTD_malloc(size_t size, ZSTD_customMem customMem) { - void* address = malloc(size); - (void)opaque; - return address; + if (customMem.customAlloc) + return customMem.customAlloc(customMem.opaque, size); + return malloc(size); } -void ZSTD_defaultFreeFunction(void* opaque, void* address) +void* ZSTD_calloc(size_t size, ZSTD_customMem customMem) { - (void)opaque; - free(address); -} - -void* ZSTD_malloc(size_t size, ZSTD_customMem customMem) -{ - return customMem.customAlloc(customMem.opaque, size); + if (customMem.customAlloc) { + /* calloc implemented as malloc+memset; + * not as efficient as calloc, but next best guess for custom malloc */ + void* const ptr = customMem.customAlloc(customMem.opaque, size); + memset(ptr, 0, size); + return ptr; + } + return calloc(1, size); } void ZSTD_free(void* ptr, ZSTD_customMem customMem) { - if (ptr!=NULL) - customMem.customFree(customMem.opaque, ptr); + if (ptr!=NULL) { + if (customMem.customFree) + customMem.customFree(customMem.opaque, ptr); + else + free(ptr); + } } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/zstd_errors.h --- a/contrib/python-zstandard/zstd/common/zstd_errors.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/zstd_errors.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ 
-/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ #ifndef ZSTD_ERRORS_H_398273423 @@ -19,10 +20,12 @@ /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ -#if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) -#else -# define ZSTDERRORLIB_VISIBILITY +#ifndef ZSTDERRORLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDERRORLIB_VISIBILITY +# endif #endif #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) # define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY @@ -32,39 +35,54 @@ # define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY #endif -/*-**************************************** -* error codes list -******************************************/ +/*-********************************************* + * Error codes list + *-********************************************* + * Error codes _values_ are pinned down since v1.3.1 only. + * Therefore, don't rely on values if you may link to any version < v1.3.1. + * + * Only values < 100 are considered stable. + * + * note 1 : this API shall be used with static linking only. + * dynamic linking is not yet officially supported. 
+ * note 2 : Prefer relying on the enum than on its value whenever possible + * This is the only supported way to use the error list < v1.3.1 + * note 3 : ZSTD_isError() is always correct, whatever the library version. + **********************************************/ typedef enum { - ZSTD_error_no_error, - ZSTD_error_GENERIC, - ZSTD_error_prefix_unknown, - ZSTD_error_version_unsupported, - ZSTD_error_parameter_unknown, - ZSTD_error_frameParameter_unsupported, - ZSTD_error_frameParameter_unsupportedBy32bits, - ZSTD_error_frameParameter_windowTooLarge, - ZSTD_error_compressionParameter_unsupported, - ZSTD_error_init_missing, - ZSTD_error_memory_allocation, - ZSTD_error_stage_wrong, - ZSTD_error_dstSize_tooSmall, - ZSTD_error_srcSize_wrong, - ZSTD_error_corruption_detected, - ZSTD_error_checksum_wrong, - ZSTD_error_tableLog_tooLarge, - ZSTD_error_maxSymbolValue_tooLarge, - ZSTD_error_maxSymbolValue_tooSmall, - ZSTD_error_dictionary_corrupted, - ZSTD_error_dictionary_wrong, - ZSTD_error_maxCode + ZSTD_error_no_error = 0, + ZSTD_error_GENERIC = 1, + ZSTD_error_prefix_unknown = 10, + ZSTD_error_version_unsupported = 12, + ZSTD_error_frameParameter_unsupported = 14, + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, + ZSTD_error_workSpace_tooSmall= 66, + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 
102, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ } ZSTD_ErrorCode; /*! ZSTD_getErrorCode() : convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, - which can be used to compare directly with enum list published into "error_public.h" */ + which can be used to compare with enum list published above */ ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); -ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ #if defined (__cplusplus) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/common/zstd_internal.h --- a/contrib/python-zstandard/zstd/common/zstd_internal.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/common/zstd_internal.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,59 +1,87 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ #ifndef ZSTD_CCOMMON_H_MODULE #define ZSTD_CCOMMON_H_MODULE -/*-******************************************************* -* Compiler specifics -*********************************************************/ -#ifdef _MSC_VER /* Visual Studio */ -# define FORCE_INLINE static __forceinline -# include /* For Visual 2005 */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma warning(disable : 4324) /* disable: C4324: padded structure */ -# pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */ -#else -# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -# else -# define FORCE_INLINE static -# endif /* __STDC_VERSION__ */ +/* this module contains definitions which must be identical + * across compression, decompression and dictBuilder. + * It also contains a few functions useful to at least 2 of them + * and which benefit from being inlined */ + +/*-************************************* +* Dependencies +***************************************/ +#include "compiler.h" +#include "mem.h" +#include "error_private.h" +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#define FSE_STATIC_LINKING_ONLY +#include "fse.h" +#define HUF_STATIC_LINKING_ONLY +#include "huf.h" +#ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ #endif +#include "xxhash.h" /* XXH_reset, update, digest */ -#ifdef _MSC_VER -# define FORCE_NOINLINE static __declspec(noinline) -#else -# ifdef __GNUC__ -# define FORCE_NOINLINE static __attribute__((__noinline__)) -# else -# define FORCE_NOINLINE static -# endif + +#if defined (__cplusplus) +extern "C" { #endif /*-************************************* -* Dependencies +* Debug ***************************************/ -#include "mem.h" -#include "error_private.h" 
-#define ZSTD_STATIC_LINKING_ONLY -#include "zstd.h" +#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1) +# include +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; } + +#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2) +# include +extern int g_debuglog_enable; +/* recommended values for ZSTD_DEBUG display levels : + * 1 : no display, enables assert() only + * 2 : reserved for currently active debug path + * 3 : events once per object lifetime (CCtx, CDict, etc.) + * 4 : events once per frame + * 5 : events once per block + * 6 : events once per sequence (*very* verbose) */ +# define RAWLOG(l, ...) { \ + if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) { \ + fprintf(stderr, __VA_ARGS__); \ + } } +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) { \ + fprintf(stderr, __FILE__ ": " __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define RAWLOG(l, ...) {} /* disabled */ +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif /*-************************************* * shared macros ***************************************/ +#undef MIN +#undef MAX #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? 
(a) : (b)) #define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; } /* check and Forward error code */ @@ -64,12 +92,9 @@ * Common constants ***************************************/ #define ZSTD_OPT_NUM (1<<12) -#define ZSTD_DICT_MAGIC 0xEC30A437 /* v0.7+ */ #define ZSTD_REP_NUM 3 /* number of repcodes */ -#define ZSTD_REP_CHECK (ZSTD_REP_NUM) /* number of repcodes to check by the optimal parser */ #define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) -#define ZSTD_REP_MOVE_OPT (ZSTD_REP_NUM) static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 }; #define KB *(1 <<10) @@ -84,9 +109,13 @@ #define BIT0 1 #define ZSTD_WINDOWLOG_ABSOLUTEMIN 10 +#define ZSTD_WINDOWLOG_DEFAULTMAX 27 /* Default maximum allowed window log */ static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 }; static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 }; +#define ZSTD_FRAMEIDSIZE 4 +static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE; /* magic number size */ + #define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; @@ -100,40 +129,53 @@ #define LONGNBSEQ 0x7F00 #define MINMATCH 3 -#define EQUAL_READ32 4 #define Litbits 8 #define MaxLit ((1<= 3) /* GCC Intrinsic */ - return 31 - __builtin_clz(val); + return 31 - __builtin_clz(val); # else /* Software version */ - static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; - U32 v = val; - int r; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27]; - return r; + static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; + U32 v = val; + v 
|= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; # endif + } } -/* hidden functions */ - /* ZSTD_invalidateRepCodes() : * ensures next compression will not use repcodes from previous block. * Note : only works with regular variant; * do not use with extDict variant ! */ -void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); /* zstdmt, adaptive_compression (shouldn't get this definition from here) */ +typedef struct { + blockType_e blockType; + U32 lastBlock; + U32 origSize; +} blockProperties_t; + +/*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +/* Used by: decompress, fullbench (does not get its definition from here) */ +size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + +#if defined (__cplusplus) +} +#endif + #endif /* ZSTD_CCOMMON_H_MODULE */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/fse_compress.c --- a/contrib/python-zstandard/zstd/compress/fse_compress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/compress/fse_compress.c Wed Apr 18 15:32:08 2018 -0400 @@ -33,40 +33,22 @@ ****************************************************************** */ /* ************************************************************** -* Compiler specifics -****************************************************************/ -#ifdef _MSC_VER /* Visual Studio */ -# define FORCE_INLINE static __forceinline -# include /* For Visual 2005 */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */ -#else -# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif 
-# else -# define FORCE_INLINE static -# endif /* __STDC_VERSION__ */ -#endif - - -/* ************************************************************** * Includes ****************************************************************/ #include /* malloc, free, qsort */ #include /* memcpy, memset */ #include /* printf (debug) */ #include "bitstream.h" +#include "compiler.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" +#include "error_private.h" /* ************************************************************** * Error Management ****************************************************************/ +#define FSE_isError ERR_isError #define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ @@ -201,8 +183,6 @@ return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ } -static short FSE_abs(short a) { return (short)(a<0 ? -a : a); } - static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, unsigned writeIsSafe) @@ -258,17 +238,17 @@ bitStream >>= 16; bitCount -= 16; } } - { short count = normalizedCounter[charnum++]; - const short max = (short)((2*threshold-1)-remaining); - remaining -= FSE_abs(count); - if (remaining<1) return ERROR(GENERIC); + { int count = normalizedCounter[charnum++]; + int const max = (2*threshold-1)-remaining; + remaining -= count < 0 ? -count : count; count++; /* +1 for extra accuracy */ if (count>=threshold) count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */ bitStream += count << bitCount; bitCount += nbBits; bitCount -= (count>=1; + if (remaining<1) return ERROR(GENERIC); + while (remaining>=1; } } if (bitCount>16) { if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall); /* Buffer overflow */ @@ -293,7 +273,7 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { - if (tableLog > FSE_MAX_TABLELOG) return ERROR(GENERIC); /* Unsupported */ + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported */ if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported */ if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) @@ -312,7 +292,7 @@ It doesn't use any additional memory. But this function is unsafe : it doesn't check that all values within `src` can fit into `count`. For this reason, prefer using a table `count` with 256 elements. - @return : count of most numerous element + @return : count of most numerous element. */ size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize) @@ -325,7 +305,10 @@ memset(count, 0, (maxSymbolValue+1)*sizeof(*count)); if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; } - while (ip 255) maxSymbolValue = 255; + for (s=0; s<=maxSymbolValue; s++) { count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; if (count[s] > max) max = count[s]; } } @@ -413,9 +399,11 @@ * Same as FSE_countFast(), but using an externally provided scratch buffer. 
* `workSpace` size must be table of >= `1024` unsigned */ size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize, unsigned* workSpace) + const void* source, size_t sourceSize, + unsigned* workSpace) { - if (sourceSize < 1500) return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); + if (sourceSize < 1500) /* heuristic threshold */ + return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace); } @@ -478,20 +466,22 @@ /* provides the minimum logSize to safely represent a distribution */ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) { - U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1; - U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; - U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols; - return minBits; + U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1; + U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; } unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) { - U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; + U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; U32 tableLog = maxTableLog; - U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; - if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */ - if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */ + if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */ + if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */ if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG; if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG; return tableLog; @@ -508,6 +498,7 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue) { + short const NOT_YET_ASSIGNED = -2; U32 s; U32 distributed = 0; U32 ToDistribute; @@ -533,7 +524,8 @@ total -= count[s]; continue; } - norm[s]=-2; + + norm[s]=NOT_YET_ASSIGNED; } ToDistribute = (1 << tableLog) - distributed; @@ -541,7 +533,7 @@ /* risk of rounding to zero */ lowOne = (U32)((total * 3) / (ToDistribute * 2)); for (s=0; s<=maxSymbolValue; s++) { - if ((norm[s] == -2) && (count[s] <= lowOne)) { + if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) { norm[s] = 1; distributed++; total -= count[s]; @@ -556,17 +548,24 @@ find max, then give all remaining points to max */ U32 maxV = 0, maxC = 0; for (s=0; s<=maxSymbolValue; s++) - if (count[s] > maxC) maxV=s, maxC=count[s]; + if (count[s] > 
maxC) { maxV=s; maxC=count[s]; } norm[maxV] += (short)ToDistribute; return 0; } + if (total == 0) { + /* all of the symbols were low enough for the lowOne or lowThreshold */ + for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1)) + if (norm[s] > 0) { ToDistribute--; norm[s]++; } + return 0; + } + { U64 const vStepLog = 62 - tableLog; U64 const mid = (1ULL << (vStepLog-1)) - 1; U64 const rStep = ((((U64)1<> vStepLog); U32 const sEnd = (U32)(end >> vStepLog); @@ -591,7 +590,7 @@ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported size */ if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */ - { U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 }; + { static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 }; U64 const scale = 62 - tableLog; U64 const step = ((U64)1<<62) / total; /* <== here, one division ! */ U64 const vStep = 1ULL<<(scale-20); @@ -613,7 +612,7 @@ U64 restToBeat = vStep * rtbTable[proba]; proba += (count[s]*step) - ((U64)proba< restToBeat; } - if (proba > largestP) largestP=proba, largest=s; + if (proba > largestP) { largestP=proba; largest=s; } normalizedCounter[s] = proba; stillToDistribute -= proba; } } @@ -774,7 +773,7 @@ size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); } -#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return f +#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e #define CHECK_F(f) { CHECK_V_F(_var_err__, f); } /* FSE_compress_wksp() : @@ -801,7 +800,7 @@ if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG; /* Scan input and build symbol stats */ - { CHECK_V_F(maxCount, FSE_count(count, &maxSymbolValue, src, srcSize) ); + { CHECK_V_F(maxCount, FSE_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned*)scratchBuffer) ); if (maxCount == srcSize) return 1; /* only a single symbol in src : 
rle */ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */ if (maxCount < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/huf_compress.c --- a/contrib/python-zstandard/zstd/compress/huf_compress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/compress/huf_compress.c Wed Apr 18 15:32:08 2018 -0400 @@ -46,17 +46,20 @@ #include /* memcpy, memset */ #include /* printf (debug) */ #include "bitstream.h" +#include "compiler.h" #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ #include "fse.h" /* header compression */ #define HUF_STATIC_LINKING_ONLY #include "huf.h" +#include "error_private.h" /* ************************************************************** * Error Management ****************************************************************/ +#define HUF_isError ERR_isError #define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ -#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return f +#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e #define CHECK_F(f) { CHECK_V_F(_var_err__, f); } @@ -127,7 +130,7 @@ }; /* typedef'd to HUF_CElt within "huf.h" */ /*! HUF_writeCTable() : - `CTable` : huffman tree to save, using huf representation. + `CTable` : Huffman tree to save, using huf representation. 
@return : size of saved CTable */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog) @@ -165,7 +168,7 @@ } -size_t HUF_readCTable (HUF_CElt* CTable, U32 maxSymbolValue, const void* src, size_t srcSize) +size_t HUF_readCTable (HUF_CElt* CTable, U32* maxSymbolValuePtr, const void* src, size_t srcSize) { BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; /* init not required, even though some static analyzer may complain */ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ @@ -177,7 +180,7 @@ /* check result */ if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); - if (nbSymbols > maxSymbolValue+1) return ERROR(maxSymbolValue_tooSmall); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); /* Prepare base value per rank */ { U32 n, nextRankStart = 0; @@ -206,9 +209,10 @@ min >>= 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=0; n<=maxSymbolValue; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; } + { U32 n; for (n=0; n find closest one (note : there is necessarily at least one !) 
*/ - while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) nBitsToDecrease ++; totalCost -= 1 << (nBitsToDecrease-1); if (rankLast[nBitsToDecrease-1] == noSymbol) @@ -318,7 +323,10 @@ U32 const c = count[n]; U32 const r = BIT_highbit32(c+1) + 1; U32 pos = rank[r].current++; - while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) huffNode[pos]=huffNode[pos-1], pos--; + while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) { + huffNode[pos] = huffNode[pos-1]; + pos--; + } huffNode[pos].count = c; huffNode[pos].byte = (BYTE)n; } @@ -327,10 +335,10 @@ /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. - * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned. 
*/ #define STARTNODE (HUF_SYMBOLVALUE_MAX+1) -typedef nodeElt huffNodeTable[2*HUF_SYMBOLVALUE_MAX+1 +1]; +typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) { nodeElt* const huffNode0 = (nodeElt*)workSpace; @@ -341,9 +349,10 @@ U32 nodeRoot; /* safety checks */ - if (wkspSize < sizeof(huffNodeTable)) return ERROR(GENERIC); /* workSpace is not large enough */ + if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (wkspSize < sizeof(huffNodeTable)) return ERROR(workSpace_tooSmall); if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; - if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); memset(huffNode0, 0, sizeof(huffNodeTable)); /* sort, decreasing order */ @@ -401,6 +410,7 @@ } /** HUF_buildCTable() : + * @return : maxNbBits * Note : count is used before tree is written, so they can safely overlap */ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits) @@ -409,14 +419,34 @@ return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable)); } -static void HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) +{ + size_t nbBits = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + nbBits += CTable[s].nbBits * count[s]; + } + return nbBits >> 3; +} + +static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + int bad = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (CTable[s].nbBits == 0); + } + return !bad; +} + +size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } 
+ +FORCE_INLINE_TEMPLATE void +HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) { BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); } -size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } - -#define HUF_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s)) +#define HUF_FLUSHBITS(s) BIT_flushBits(s) #define HUF_FLUSHBITS_1(stream) \ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) @@ -424,14 +454,16 @@ #define HUF_FLUSHBITS_2(stream) \ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +FORCE_INLINE_TEMPLATE size_t +HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) { const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; size_t n; - const unsigned fast = (dstSize >= HUF_BLOCKBOUND(srcSize)); BIT_CStream_t bitC; /* init */ @@ -444,12 +476,15 @@ { case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); HUF_FLUSHBITS_2(&bitC); + /* fall-through */ case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); HUF_FLUSHBITS_1(&bitC); + /* fall-through */ case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); HUF_FLUSHBITS(&bitC); - case 0 : - default: ; + /* fall-through */ + case 0 : /* fall-through */ + default: break; } for (; n>0; n-=4) { /* note : n&3==0 at this stage */ @@ -466,8 +501,58 @@ return BIT_closeCStream(&bitC); } +#if DYNAMIC_BMI2 -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +static TARGET_ATTRIBUTE("bmi2") size_t +HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return 
HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + if (bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +} + +#else + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + (void)bmi2; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +#endif + +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + + +static size_t +HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, int bmi2) { size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ const BYTE* ip = (const BYTE*) src; @@ -480,28 +565,31 @@ if (srcSize < 12) return 0; /* no saving possible : too small input */ op += 6; /* jumpTable */ - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); if (cSize==0) return 0; + assert(cSize <= 65535); MEM_writeLE16(ostart, (U16)cSize); op += cSize; } ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) ); + { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); if (cSize==0) return 0; + assert(cSize <= 65535); MEM_writeLE16(ostart+2, (U16)cSize); op += cSize; } ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); if (cSize==0) return 0; + assert(cSize <= 65535); MEM_writeLE16(ostart+4, (U16)cSize); op += cSize; } ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, iend-ip, CTable) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) ); if (cSize==0) return 0; op += cSize; } @@ -509,65 +597,120 @@ return op-ostart; } +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} -/* `workSpace` must a table of at least 1024 unsigned */ + +static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, + unsigned singleStream, const HUF_CElt* CTable, const int bmi2) +{ + size_t const cSize = singleStream ? 
+ HUF_compress1X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2) : + HUF_compress4X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; + /* check compressibility */ + if ((size_t)(op-ostart) >= srcSize-1) { return 0; } + return op-ostart; +} + +typedef struct { + U32 count[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + huffNodeTable nodeTable; +} HUF_compress_tables_t; + +/* HUF_compress_internal() : + * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ static size_t HUF_compress_internal ( void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, unsigned singleStream, - void* workSpace, size_t wkspSize) + void* workSpace, size_t wkspSize, + HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, + const int bmi2) { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; - union { - U32 count[HUF_SYMBOLVALUE_MAX+1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX+1]; - } table; /* `count` can overlap with `CTable`; saves 1 KB */ - /* checks & inits */ - if (wkspSize < sizeof(huffNodeTable)) return ERROR(GENERIC); - if (!srcSize) return 0; /* Uncompressed (note : 1 means rle, so first byte must be correct) */ - if (!dstSize) return 0; /* cannot fit within dst budget */ + if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); + if (!srcSize) return 0; /* Uncompressed */ + if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (maxSymbolValue > 
HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX; if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + /* Heuristic : If old table is valid, use it for small inputs */ + if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + singleStream, oldHufTable, bmi2); + } + /* Scan input and build symbol stats */ - { CHECK_V_F(largest, FSE_count_wksp (table.count, &maxSymbolValue, (const BYTE*)src, srcSize, (U32*)workSpace) ); + { CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) ); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ - if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */ + if (largest <= (srcSize >> 7)+1) return 0; /* heuristic : probably not compressible enough */ + } + + /* Check validity of previous table */ + if ( repeat + && *repeat == HUF_repeat_check + && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + singleStream, oldHufTable, bmi2); } /* Build Huffman Tree */ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); - { CHECK_V_F(maxBits, HUF_buildCTable_wksp (table.CTable, table.count, maxSymbolValue, huffLog, workSpace, wkspSize) ); + { CHECK_V_F(maxBits, HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + table->nodeTable, sizeof(table->nodeTable)) ); huffLog = (U32)maxBits; + /* Zero unused symbols in CTable, so we can check it for validity */ + memset(table->CTable + (maxSymbolValue + 1), 0, + sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); } /* Write table description header 
*/ - { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table.CTable, maxSymbolValue, huffLog) ); - if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */ - op += hSize; - } + { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) ); + /* Check if using previous huffman table is beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); + size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + singleStream, oldHufTable, bmi2); + } } - /* Compress */ - { size_t const cSize = (singleStream) ? - HUF_compress1X_usingCTable(op, oend - op, src, srcSize, table.CTable) : /* single segment */ - HUF_compress4X_usingCTable(op, oend - op, src, srcSize, table.CTable); - if (HUF_isError(cSize)) return cSize; - if (cSize==0) return 0; /* uncompressible */ - op += cSize; + /* Use the new huffman table */ + if (hSize + 12ul >= srcSize) { return 0; } + op += hSize; + if (repeat) { *repeat = HUF_repeat_none; } + if (oldHufTable) + memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ } - - /* check compressibility */ - if ((size_t)(op-ostart) >= srcSize-1) - return 0; - - return op-ostart; + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + singleStream, table->CTable, bmi2); } @@ -576,34 +719,70 @@ unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize) { - return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, 1 /*single stream*/, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/); +} + +size_t HUF_compress1X_repeat (void* dst, size_t dstSize, 
+ const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, 1 /*single stream*/, + workSpace, wkspSize, hufTable, + repeat, preferRepeat, bmi2); } size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog) { - unsigned workSpace[1024]; + unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); } +/* HUF_compress4X_repeat(): + * compress input using 4 streams. + * provide workspace to generate compression tables */ size_t HUF_compress4X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize) { - return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, 0 /*4 streams*/, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/); +} + +/* HUF_compress4X_repeat(): + * compress input using 4 streams. 
+ * re-use an existing huffman compression table */ +size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, 0 /* 4 streams */, + workSpace, wkspSize, + hufTable, repeat, preferRepeat, bmi2); } size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog) { - unsigned workSpace[1024]; + unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); } size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize) { - return HUF_compress2(dst, maxDstSize, src, (U32)srcSize, 255, HUF_TABLELOG_DEFAULT); + return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_compress.c --- a/contrib/python-zstandard/zstd/compress/zstd_compress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/compress/zstd_compress.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,162 +1,619 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ /*-************************************* +* Tuning parameters +***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + + +/*-************************************* * Dependencies ***************************************/ #include /* memset */ +#include "cpu.h" #include "mem.h" -#define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ -#include "xxhash.h" /* XXH_reset, update, digest */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "fse.h" #define HUF_STATIC_LINKING_ONLY #include "huf.h" -#include "zstd_internal.h" /* includes zstd.h */ - - -/*-************************************* -* Constants -***************************************/ -static const U32 g_searchStrength = 8; /* control skip over incompressible data */ -#define HASH_READ_SIZE 8 -typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; +#include "zstd_compress_internal.h" +#include "zstd_fast.h" +#include "zstd_double_fast.h" +#include "zstd_lazy.h" +#include "zstd_opt.h" +#include "zstd_ldm.h" /*-************************************* * Helper functions ***************************************/ -#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; } -size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; } - - -/*-************************************* -* Sequence storage -***************************************/ -static void ZSTD_resetSeqStore(seqStore_t* ssPtr) -{ - ssPtr->lit = ssPtr->litStart; - ssPtr->sequences = ssPtr->sequencesStart; - ssPtr->longLengthID = 0; +size_t ZSTD_compressBound(size_t srcSize) { + return ZSTD_COMPRESSBOUND(srcSize); } /*-************************************* * Context memory management ***************************************/ -struct ZSTD_CCtx_s { - const BYTE* nextSrc; /* next block here to continue on current prefix */ - const BYTE* base; /* All regular indexes relative to this position */ - const 
BYTE* dictBase; /* extDict indexes relative to this position */ - U32 dictLimit; /* below that point, need extDict */ - U32 lowLimit; /* below that point, no more data */ - U32 nextToUpdate; /* index from which to continue dictionary update */ - U32 nextToUpdate3; /* index from which to continue dictionary update */ - U32 hashLog3; /* dispatch table : larger == faster, more memory */ - U32 loadedDictEnd; /* index of end of dictionary */ - U32 forceWindow; /* force back-references to respect limit of 1<customMem = customMem; + ZSTD_STATIC_ASSERT(zcss_init==0); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_calloc(sizeof(ZSTD_CCtx), customMem); + if (!cctx) return NULL; + cctx->customMem = customMem; + cctx->requestedParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; + cctx->requestedParams.fParams.contentSizeFlag = 1; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; + } +} + +ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_CCtx* const cctx = (ZSTD_CCtx*) workspace; + if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ + if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ + memset(workspace, 0, workspaceSize); /* may be a bit generous, could memset be smaller ? */ + cctx->staticSize = workspaceSize; + cctx->workSpace = (void*)(cctx+1); + cctx->workSpaceSize = workspaceSize - sizeof(ZSTD_CCtx); + + /* statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ + if (cctx->workSpaceSize < HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t)) return NULL; + assert(((size_t)cctx->workSpace & (sizeof(void*)-1)) == 0); /* ensure correct alignment */ + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)cctx->workSpace; + cctx->blockState.nextCBlock = cctx->blockState.prevCBlock + 1; + { + void* const ptr = cctx->blockState.nextCBlock + 1; + cctx->entropyWorkspace = (U32*)ptr; + } + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); return cctx; } size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) { if (cctx==NULL) return 0; /* support free on NULL */ - ZSTD_free(cctx->workSpace, cctx->customMem); + if (cctx->staticSize) return ERROR(memory_allocation); /* not compatible with static CCtx */ + ZSTD_free(cctx->workSpace, cctx->customMem); cctx->workSpace = NULL; + ZSTD_freeCDict(cctx->cdictLocal); cctx->cdictLocal = NULL; +#ifdef ZSTD_MULTITHREAD + ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL; +#endif ZSTD_free(cctx, cctx->customMem); return 0; /* reserved as a potential error code in the future */ } + +static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + return ZSTDMT_sizeof_CCtx(cctx->mtctx); +#else + (void) cctx; + return 0; +#endif +} + + size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) { if (cctx==NULL) return 0; /* support sizeof on NULL */ - return sizeof(*cctx) + cctx->workSpaceSize; + return sizeof(*cctx) + cctx->workSpaceSize + + ZSTD_sizeof_CDict(cctx->cdictLocal) + + ZSTD_sizeof_mtctx(cctx); +} + +size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) +{ + return ZSTD_sizeof_CCtx(zcs); /* same object */ +} + +/* private API call, for dictBuilder only */ +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) +{ + ZSTD_compressionParameters cParams = 
ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); + if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; + if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; + if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog; + if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog; + if (CCtxParams->cParams.searchLength) cParams.searchLength = CCtxParams->cParams.searchLength; + if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength; + if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy; + return cParams; +} + +static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params cctxParams; + memset(&cctxParams, 0, sizeof(cctxParams)); + cctxParams.cParams = cParams; + cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + assert(!ZSTD_checkCParams(cParams)); + cctxParams.fParams.contentSizeFlag = 1; + return cctxParams; +} + +static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced( + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params* params; + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + params = (ZSTD_CCtx_params*)ZSTD_calloc( + sizeof(ZSTD_CCtx_params), customMem); + if (!params) { return NULL; } + params->customMem = customMem; + params->compressionLevel = ZSTD_CLEVEL_DEFAULT; + params->fParams.contentSizeFlag = 1; + return params; } -size_t ZSTD_setCCtxParameter(ZSTD_CCtx* cctx, ZSTD_CCtxParameter param, unsigned value) +ZSTD_CCtx_params* ZSTD_createCCtxParams(void) +{ + return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem); +} + +size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) +{ + if (params == NULL) { return 0; } + ZSTD_free(params, 
params->customMem); + return 0; +} + +size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) +{ + return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); +} + +size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) { + if (!cctxParams) { return ERROR(GENERIC); } + memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->compressionLevel = compressionLevel; + cctxParams->fParams.contentSizeFlag = 1; + return 0; +} + +size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +{ + if (!cctxParams) { return ERROR(GENERIC); } + CHECK_F( ZSTD_checkCParams(params.cParams) ); + memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->cParams = params.cParams; + cctxParams->fParams = params.fParams; + cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + assert(!ZSTD_checkCParams(params.cParams)); + return 0; +} + +/* ZSTD_assignParamsToCCtxParams() : + * params is presumed valid at this stage */ +static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams( + ZSTD_CCtx_params cctxParams, ZSTD_parameters params) +{ + ZSTD_CCtx_params ret = cctxParams; + ret.cParams = params.cParams; + ret.fParams = params.fParams; + ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + assert(!ZSTD_checkCParams(params.cParams)); + return ret; +} + +#define CLAMPCHECK(val,min,max) { \ + if (((val)<(min)) | ((val)>(max))) { \ + return ERROR(parameter_outOfBound); \ +} } + + +static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) { switch(param) { - case ZSTD_p_forceWindow : cctx->forceWindow = value>0; cctx->loadedDictEnd = 0; return 0; - default: return ERROR(parameter_unknown); + case ZSTD_p_compressionLevel: + case ZSTD_p_hashLog: + case ZSTD_p_chainLog: + case ZSTD_p_searchLog: + case ZSTD_p_minMatch: + case ZSTD_p_targetLength: + case ZSTD_p_compressionStrategy: + case ZSTD_p_compressLiterals: + 
return 1; + + case ZSTD_p_format: + case ZSTD_p_windowLog: + case ZSTD_p_contentSizeFlag: + case ZSTD_p_checksumFlag: + case ZSTD_p_dictIDFlag: + case ZSTD_p_forceMaxWindow : + case ZSTD_p_nbWorkers: + case ZSTD_p_jobSize: + case ZSTD_p_overlapSizeLog: + case ZSTD_p_enableLongDistanceMatching: + case ZSTD_p_ldmHashLog: + case ZSTD_p_ldmMinMatch: + case ZSTD_p_ldmBucketSizeLog: + case ZSTD_p_ldmHashEveryLog: + default: + return 0; + } +} + +size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParameter (%u, %u)", (U32)param, value); + if (cctx->streamStage != zcss_init) { + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { + return ERROR(stage_wrong); + } } + + switch(param) + { + case ZSTD_p_format : + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_compressionLevel: + if (cctx->cdict) return ERROR(stage_wrong); + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_windowLog: + case ZSTD_p_hashLog: + case ZSTD_p_chainLog: + case ZSTD_p_searchLog: + case ZSTD_p_minMatch: + case ZSTD_p_targetLength: + case ZSTD_p_compressionStrategy: + if (cctx->cdict) return ERROR(stage_wrong); + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_compressLiterals: + case ZSTD_p_contentSizeFlag: + case ZSTD_p_checksumFlag: + case ZSTD_p_dictIDFlag: + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_forceMaxWindow : /* Force back-references to remain < windowSize, + * even when referencing into Dictionary content. 
+ * default : 0 when using a CDict, 1 when using a Prefix */ + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_nbWorkers: + if ((value>0) && cctx->staticSize) { + return ERROR(parameter_unsupported); /* MT not compatible with static alloc */ + } + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_jobSize: + case ZSTD_p_overlapSizeLog: + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + case ZSTD_p_enableLongDistanceMatching: + case ZSTD_p_ldmHashLog: + case ZSTD_p_ldmMinMatch: + case ZSTD_p_ldmBucketSizeLog: + case ZSTD_p_ldmHashEveryLog: + if (cctx->cdict) return ERROR(stage_wrong); + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + + default: return ERROR(parameter_unsupported); } } -const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) /* hidden interface */ -{ - return &(ctx->seqStore); -} - -static ZSTD_parameters ZSTD_getParamsFromCCtx(const ZSTD_CCtx* cctx) +size_t ZSTD_CCtxParam_setParameter( + ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, unsigned value) { - return cctx->params; + DEBUGLOG(4, "ZSTD_CCtxParam_setParameter (%u, %u)", (U32)param, value); + switch(param) + { + case ZSTD_p_format : + if (value > (unsigned)ZSTD_f_zstd1_magicless) + return ERROR(parameter_unsupported); + CCtxParams->format = (ZSTD_format_e)value; + return (size_t)CCtxParams->format; + + case ZSTD_p_compressionLevel : { + int cLevel = (int)value; /* cast expected to restore negative sign */ + if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel(); + if (cLevel) { /* 0 : does not change current level */ + CCtxParams->disableLiteralCompression = (cLevel<0); /* negative levels disable huffman */ + CCtxParams->compressionLevel = cLevel; + } + if (CCtxParams->compressionLevel >= 0) return CCtxParams->compressionLevel; + return 0; /* return type (size_t) cannot represent negative values */ + } + + case ZSTD_p_windowLog : + if (value>0) /* 0 
=> use default */ + CLAMPCHECK(value, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); + CCtxParams->cParams.windowLog = value; + return CCtxParams->cParams.windowLog; + + case ZSTD_p_hashLog : + if (value>0) /* 0 => use default */ + CLAMPCHECK(value, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX); + CCtxParams->cParams.hashLog = value; + return CCtxParams->cParams.hashLog; + + case ZSTD_p_chainLog : + if (value>0) /* 0 => use default */ + CLAMPCHECK(value, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX); + CCtxParams->cParams.chainLog = value; + return CCtxParams->cParams.chainLog; + + case ZSTD_p_searchLog : + if (value>0) /* 0 => use default */ + CLAMPCHECK(value, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX); + CCtxParams->cParams.searchLog = value; + return value; + + case ZSTD_p_minMatch : + if (value>0) /* 0 => use default */ + CLAMPCHECK(value, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX); + CCtxParams->cParams.searchLength = value; + return CCtxParams->cParams.searchLength; + + case ZSTD_p_targetLength : + /* all values are valid. 
0 => use default */ + CCtxParams->cParams.targetLength = value; + return CCtxParams->cParams.targetLength; + + case ZSTD_p_compressionStrategy : + if (value>0) /* 0 => use default */ + CLAMPCHECK(value, (unsigned)ZSTD_fast, (unsigned)ZSTD_btultra); + CCtxParams->cParams.strategy = (ZSTD_strategy)value; + return (size_t)CCtxParams->cParams.strategy; + + case ZSTD_p_compressLiterals: + CCtxParams->disableLiteralCompression = !value; + return !CCtxParams->disableLiteralCompression; + + case ZSTD_p_contentSizeFlag : + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value>0)); + CCtxParams->fParams.contentSizeFlag = value > 0; + return CCtxParams->fParams.contentSizeFlag; + + case ZSTD_p_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value > 0; + return CCtxParams->fParams.checksumFlag; + + case ZSTD_p_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value>0)); + CCtxParams->fParams.noDictIDFlag = !value; + return !CCtxParams->fParams.noDictIDFlag; + + case ZSTD_p_forceMaxWindow : + CCtxParams->forceWindow = (value > 0); + return CCtxParams->forceWindow; + + case ZSTD_p_nbWorkers : +#ifndef ZSTD_MULTITHREAD + if (value>0) return ERROR(parameter_unsupported); + return 0; +#else + return ZSTDMT_CCtxParam_setNbWorkers(CCtxParams, value); +#endif + + case ZSTD_p_jobSize : +#ifndef ZSTD_MULTITHREAD + return ERROR(parameter_unsupported); +#else + return ZSTDMT_CCtxParam_setMTCtxParameter(CCtxParams, ZSTDMT_p_jobSize, value); +#endif + + case ZSTD_p_overlapSizeLog : +#ifndef ZSTD_MULTITHREAD + return ERROR(parameter_unsupported); +#else + return ZSTDMT_CCtxParam_setMTCtxParameter(CCtxParams, ZSTDMT_p_overlapSectionLog, value); +#endif + + case ZSTD_p_enableLongDistanceMatching : + CCtxParams->ldmParams.enableLdm = (value>0); + 
return CCtxParams->ldmParams.enableLdm; + + case ZSTD_p_ldmHashLog : + if (value>0) /* 0 ==> auto */ + CLAMPCHECK(value, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX); + CCtxParams->ldmParams.hashLog = value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_p_ldmMinMatch : + if (value>0) /* 0 ==> default */ + CLAMPCHECK(value, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX); + CCtxParams->ldmParams.minMatchLength = value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_p_ldmBucketSizeLog : + if (value > ZSTD_LDM_BUCKETSIZELOG_MAX) + return ERROR(parameter_outOfBound); + CCtxParams->ldmParams.bucketSizeLog = value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_p_ldmHashEveryLog : + if (value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + return ERROR(parameter_outOfBound); + CCtxParams->ldmParams.hashEveryLog = value; + return CCtxParams->ldmParams.hashEveryLog; + + default: return ERROR(parameter_unsupported); + } } - -/** ZSTD_checkParams() : - ensure param values remain within authorized range. +/** ZSTD_CCtx_setParametersUsingCCtxParams() : + * just applies `params` into `cctx` + * no action is performed, parameters are merely stored. + * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx. + * This is possible even if a compression is ongoing. + * In which case, new parameters will be applied on the fly, starting with next compression job. 
+ */ +size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) +{ + if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); + if (cctx->cdict) return ERROR(stage_wrong); + + cctx->requestedParams = *params; + return 0; +} + +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); + if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_advanced( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) +{ + if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); + if (cctx->staticSize) return ERROR(memory_allocation); /* no malloc for static CCtx */ + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); + ZSTD_freeCDict(cctx->cdictLocal); /* in case one already exists */ + if (dict==NULL || dictSize==0) { /* no dictionary mode */ + cctx->cdictLocal = NULL; + cctx->cdict = NULL; + } else { + ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(&cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, dictSize); + cctx->cdictLocal = ZSTD_createCDict_advanced( + dict, dictSize, + dictLoadMethod, dictContentType, + cParams, cctx->customMem); + cctx->cdict = cctx->cdictLocal; + if (cctx->cdictLocal == NULL) + return ERROR(memory_allocation); + } + return 0; +} + +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byCopy, 
ZSTD_dct_auto); +} + + +size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); + cctx->cdict = cdict; + memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* exclusive */ + return 0; +} + +size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + +size_t ZSTD_CCtx_refPrefix_advanced( + ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); + cctx->cdict = NULL; /* prefix discards any prior cdict */ + cctx->prefixDict.dict = prefix; + cctx->prefixDict.dictSize = prefixSize; + cctx->prefixDict.dictContentType = dictContentType; + return 0; +} + +static void ZSTD_startNewCompression(ZSTD_CCtx* cctx) +{ + cctx->streamStage = zcss_init; + cctx->pledgedSrcSizePlusOne = 0; +} + +/*! ZSTD_CCtx_reset() : + * Also dumps dictionary */ +void ZSTD_CCtx_reset(ZSTD_CCtx* cctx) +{ + ZSTD_startNewCompression(cctx); + cctx->cdict = NULL; +} + +/** ZSTD_checkCParams() : + control CParam values remain within authorized range. @return : 0, or an error code if one value is beyond authorized range */ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) { -# define CLAMPCHECK(val,min,max) { if ((valmax)) return ERROR(compressionParameter_unsupported); } CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX); CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX); CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX); - { U32 const searchLengthMin = ((cParams.strategy == ZSTD_fast) | (cParams.strategy == ZSTD_greedy)) ? ZSTD_SEARCHLENGTH_MIN+1 : ZSTD_SEARCHLENGTH_MIN; - U32 const searchLengthMax = (cParams.strategy == ZSTD_fast) ? 
ZSTD_SEARCHLENGTH_MAX : ZSTD_SEARCHLENGTH_MAX-1; - CLAMPCHECK(cParams.searchLength, searchLengthMin, searchLengthMax); } - CLAMPCHECK(cParams.targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX); - if ((U32)(cParams.strategy) > (U32)ZSTD_btopt2) return ERROR(compressionParameter_unsupported); + CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX); + if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN) + return ERROR(parameter_unsupported); + if ((U32)(cParams.strategy) > (U32)ZSTD_btultra) + return ERROR(parameter_unsupported); return 0; } +/** ZSTD_clampCParams() : + * make CParam values within valid range. + * @return : valid CParams */ +static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams) +{ +# define CLAMP(val,min,max) { \ + if (valmax) val=max; \ + } + CLAMP(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); + CLAMP(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX); + CLAMP(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX); + CLAMP(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX); + CLAMP(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX); + if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN) cParams.targetLength = ZSTD_TARGETLENGTH_MIN; + if ((U32)(cParams.strategy) > (U32)ZSTD_btultra) cParams.strategy = ZSTD_btultra; + return cParams; +} /** ZSTD_cycleLog() : * condition for correct operation : hashLog > 1 */ @@ -166,162 +623,466 @@ return hashLog - btScale; } -/** ZSTD_adjustCParams() : +/** ZSTD_adjustCParams_internal() : optimize `cPar` for a given input (`srcSize` and `dictSize`). - mostly downsizing to reduce memory consumption and initialization. - Both `srcSize` and `dictSize` are optional (use 0 if unknown), - but if both are 0, no optimization can be done. - Note : cPar is considered validated at this stage. Use ZSTD_checkParams() to ensure that. 
*/ -ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) + mostly downsizing to reduce memory consumption and initialization latency. + Both `srcSize` and `dictSize` are optional (use 0 if unknown). + Note : cPar is considered validated at this stage. Use ZSTD_checkCParams() to ensure that condition. */ +ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) { - if (srcSize+dictSize == 0) return cPar; /* no size information available : no adjustment */ - - /* resize params, to use less memory when necessary */ - { U32 const minSrcSize = (srcSize==0) ? 500 : 0; - U64 const rSize = srcSize + dictSize + minSrcSize; - if (rSize < ((U64)1< srcLog) cPar.windowLog = srcLog; - } } + static const U64 minSrcSize = 513; /* (1<<9) + 1 */ + static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + + if (dictSize && (srcSize+1<2) /* srcSize unknown */ ) + srcSize = minSrcSize; /* presumed small when there is a dictionary */ + else if (srcSize == 0) + srcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* 0 == unknown : presumed large */ + + /* resize windowLog if input is small enough, to use less memory */ + if ( (srcSize < maxWindowResize) + && (dictSize < maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : + ZSTD_highbit32(tSize-1) + 1; + if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; + } if (cPar.hashLog > cPar.windowLog) cPar.hashLog = cPar.windowLog; { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); - if (cycleLog > cPar.windowLog) cPar.chainLog -= (cycleLog - cPar.windowLog); + if (cycleLog > cPar.windowLog) + cPar.chainLog -= (cycleLog - cPar.windowLog); } - if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* required for frame header */ + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* required for frame header */ return cPar; } - -size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams) +ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) { - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << cParams.windowLog); - U32 const divider = (cParams.searchLength==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; - size_t const tokenSpace = blockSize + 11*maxNbSeq; - - size_t const chainSize = (cParams.strategy == ZSTD_fast) ? 0 : (1 << cParams.chainLog); - size_t const hSize = ((size_t)1) << cParams.hashLog; - U32 const hashLog3 = (cParams.searchLength>3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, cParams.windowLog); + cPar = ZSTD_clampCParams(cPar); + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize); +} + +static size_t ZSTD_sizeof_matchState(ZSTD_compressionParameters const* cParams, const U32 forCCtx) +{ + size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = (forCCtx && cParams->searchLength==3) ? 
MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = ((size_t)1) << hashLog3; size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); - - size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<strategy == ZSTD_btopt) || + (cParams->strategy == ZSTD_btultra))) + ? optPotentialSpace + : 0; + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", + (U32)chainSize, (U32)hSize, (U32)h3Size); + return tableSpace + optSpace; +} + +size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + /* Estimate CCtx size is supported for single-threaded compression only. */ + if (params->nbWorkers > 0) { return ERROR(GENERIC); } + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, 0, 0); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); + U32 const divider = (cParams.searchLength==3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = blockSize + 11*maxNbSeq; + size_t const entropySpace = HUF_WORKSPACE_SIZE; + size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t); + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams); + size_t const ldmSeqSpace = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq); + + size_t const neededSpace = entropySpace + blockStateSpace + tokenSpace + + matchStateSize + ldmSpace + ldmSeqSpace; + + DEBUGLOG(5, "sizeof(ZSTD_CCtx) : %u", (U32)sizeof(ZSTD_CCtx)); + DEBUGLOG(5, "estimate workSpace : %u", (U32)neededSpace); + return sizeof(ZSTD_CCtx) + neededSpace; + } +} + +size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); + return ZSTD_estimateCCtxSize_usingCCtxParams(¶ms); +} + +static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) +{ + 
ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0); + return ZSTD_estimateCCtxSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCCtxSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=1; level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCCtxSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + if (params->nbWorkers > 0) { return ERROR(GENERIC); } + { size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << params->cParams.windowLog); + size_t const inBuffSize = ((size_t)1 << params->cParams.windowLog) + blockSize; + size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; + size_t const streamingSize = inBuffSize + outBuffSize; + + return CCtxSize + streamingSize; + } +} + +size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); + return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); +} + +static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) { + ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0); + return ZSTD_estimateCStreamSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCStreamSize(int compressionLevel) { + int level; + size_t memBudget = 0; + for (level=1; level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCStreamSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; } - -static U32 ZSTD_equivalentParams(ZSTD_parameters param1, ZSTD_parameters param2) +/* ZSTD_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads (non-blocking mode). 
+ */ +ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_getFrameProgression(cctx->mtctx); + } +#endif + { ZSTD_frameProgression fp; + size_t const buffered = (cctx->inBuff == NULL) ? 0 : + cctx->inBuffPos - cctx->inToCompress; + if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress); + assert(buffered <= ZSTD_BLOCKSIZE_MAX); + fp.ingested = cctx->consumedSrcSize + buffered; + fp.consumed = cctx->consumedSrcSize; + fp.produced = cctx->producedCSize; + return fp; +} } + + +static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1, + ZSTD_compressionParameters cParams2) +{ + return (cParams1.hashLog == cParams2.hashLog) + & (cParams1.chainLog == cParams2.chainLog) + & (cParams1.strategy == cParams2.strategy) /* opt parser space */ + & ((cParams1.searchLength==3) == (cParams2.searchLength==3)); /* hashlog3 space */ +} + +/** The parameters are equivalent if ldm is not enabled in both sets or + * all the parameters are equivalent. */ +static U32 ZSTD_equivalentLdmParams(ldmParams_t ldmParams1, + ldmParams_t ldmParams2) { - return (param1.cParams.hashLog == param2.cParams.hashLog) - & (param1.cParams.chainLog == param2.cParams.chainLog) - & (param1.cParams.strategy == param2.cParams.strategy) - & ((param1.cParams.searchLength==3) == (param2.cParams.searchLength==3)); + return (!ldmParams1.enableLdm && !ldmParams2.enableLdm) || + (ldmParams1.enableLdm == ldmParams2.enableLdm && + ldmParams1.hashLog == ldmParams2.hashLog && + ldmParams1.bucketSizeLog == ldmParams2.bucketSizeLog && + ldmParams1.minMatchLength == ldmParams2.minMatchLength && + ldmParams1.hashEveryLog == ldmParams2.hashEveryLog); +} + +typedef enum { ZSTDb_not_buffered, ZSTDb_buffered } ZSTD_buffered_policy_e; + +/* ZSTD_sufficientBuff() : + * check internal buffers exist for streaming if buffPol == ZSTDb_buffered . 
+ * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */ +static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1, + ZSTD_buffered_policy_e buffPol2, + ZSTD_compressionParameters cParams2, + U64 pledgedSrcSize) +{ + size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize)); + size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2); + size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0; + DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u", + (U32)windowSize2, cParams2.windowLog); + DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u", + (U32)blockSize2, (U32)blockSize1); + return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */ + & (neededBufferSize2 <= bufferSize1); +} + +/** Equivalence for resetCCtx purposes */ +static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1, + ZSTD_CCtx_params params2, + size_t buffSize1, size_t blockSize1, + ZSTD_buffered_policy_e buffPol2, + U64 pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize); + return ZSTD_equivalentCParams(params1.cParams, params2.cParams) && + ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) && + ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize); +} + +static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + bs->rep[i] = repStartValue[i]; + bs->entropy.hufCTable_repeatMode = HUF_repeat_none; + bs->entropy.offcode_repeatMode = FSE_repeat_none; + bs->entropy.matchlength_repeatMode = FSE_repeat_none; + bs->entropy.litlength_repeatMode = FSE_repeat_none; +} + +/*! ZSTD_invalidateMatchState() + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL). 
+ */ +static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) +{ + ZSTD_window_clear(&ms->window); + + ms->nextToUpdate = ms->window.dictLimit + 1; + ms->loadedDictEnd = 0; + ms->opt.litLengthSum = 0; /* force reset of btopt stats */ } /*! ZSTD_continueCCtx() : - reuse CCtx without reset (note : requires no dictionary) */ -static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_parameters params, U64 frameContentSize) + * reuse CCtx without reset (note : requires no dictionary) */ +static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_CCtx_params params, U64 pledgedSrcSize) { - U32 const end = (U32)(cctx->nextSrc - cctx->base); - cctx->params = params; - cctx->frameContentSize = frameContentSize; - cctx->lowLimit = end; - cctx->dictLimit = end; - cctx->nextToUpdate = end+1; + size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); + DEBUGLOG(4, "ZSTD_continueCCtx: re-use context in place"); + + cctx->blockSize = blockSize; /* previous block size could be different even for same windowLog, due to pledgedSrcSize */ + cctx->appliedParams = params; + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; + cctx->consumedSrcSize = 0; + cctx->producedCSize = 0; + if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) + cctx->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (U32)pledgedSrcSize, cctx->appliedParams.fParams.contentSizeFlag); cctx->stage = ZSTDcs_init; cctx->dictID = 0; - cctx->loadedDictEnd = 0; - { int i; for (i=0; irep[i] = repStartValue[i]; } - cctx->seqStore.litLengthSum = 0; /* force reset of btopt stats */ + if (params.ldmParams.enableLdm) + ZSTD_window_clear(&cctx->ldmState.window); + ZSTD_referenceExternalSequences(cctx, NULL, 0); + ZSTD_invalidateMatchState(&cctx->blockState.matchState); + ZSTD_reset_compressedBlockState(cctx->blockState.prevCBlock); XXH64_reset(&cctx->xxhState, 0); return 0; } 
-typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e; - -/*! ZSTD_resetCCtx_advanced() : - note : 'params' must be validated */ -static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc, - ZSTD_parameters params, U64 frameContentSize, - ZSTD_compResetPolicy_e const crp) +typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset } ZSTD_compResetPolicy_e; + +static void* ZSTD_reset_matchState(ZSTD_matchState_t* ms, void* ptr, ZSTD_compressionParameters const* cParams, ZSTD_compResetPolicy_e const crp, U32 const forCCtx) { - if (crp == ZSTDcrp_continue) - if (ZSTD_equivalentParams(params, zc->params)) - return ZSTD_continueCCtx(zc, params, frameContentSize); - - { size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog); + size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = (forCCtx && cParams->searchLength==3) ? 
MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = ((size_t)1) << hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + + assert(((size_t)ptr & 3) == 0); + + ms->hashLog3 = hashLog3; + memset(&ms->window, 0, sizeof(ms->window)); + ZSTD_invalidateMatchState(ms); + + /* opt parser space */ + if (forCCtx && ((cParams->strategy == ZSTD_btopt) | (cParams->strategy == ZSTD_btultra))) { + DEBUGLOG(4, "reserving optimal parser space"); + ms->opt.litFreq = (U32*)ptr; + ms->opt.litLengthFreq = ms->opt.litFreq + (1<opt.matchLengthFreq = ms->opt.litLengthFreq + (MaxLL+1); + ms->opt.offCodeFreq = ms->opt.matchLengthFreq + (MaxML+1); + ptr = ms->opt.offCodeFreq + (MaxOff+1); + ms->opt.matchTable = (ZSTD_match_t*)ptr; + ptr = ms->opt.matchTable + ZSTD_OPT_NUM+1; + ms->opt.priceTable = (ZSTD_optimal_t*)ptr; + ptr = ms->opt.priceTable + ZSTD_OPT_NUM+1; + } + + /* table Space */ + DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_noMemset); + assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */ + if (crp!=ZSTDcrp_noMemset) memset(ptr, 0, tableSpace); /* reset tables only */ + ms->hashTable = (U32*)(ptr); + ms->chainTable = ms->hashTable + hSize; + ms->hashTable3 = ms->chainTable + chainSize; + ptr = ms->hashTable3 + h3Size; + + assert(((size_t)ptr & 3) == 0); + return ptr; +} + +/*! 
ZSTD_resetCCtx_internal() : + note : `params` are assumed fully validated at this stage */ +static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + ZSTD_CCtx_params params, U64 pledgedSrcSize, + ZSTD_compResetPolicy_e const crp, + ZSTD_buffered_policy_e const zbuff) +{ + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", + (U32)pledgedSrcSize, params.cParams.windowLog); + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + + if (crp == ZSTDcrp_continue) { + if (ZSTD_equivalentParams(zc->appliedParams, params, + zc->inBuffSize, zc->blockSize, + zbuff, pledgedSrcSize)) { + DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%u)", + zc->appliedParams.cParams.windowLog, (U32)zc->blockSize); + return ZSTD_continueCCtx(zc, params, pledgedSrcSize); + } } + DEBUGLOG(4, "ZSTD_equivalentParams()==0 -> reset CCtx"); + + if (params.ldmParams.enableLdm) { + /* Adjust long distance matching parameters */ + params.ldmParams.windowLog = params.cParams.windowLog; + ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); + assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); + assert(params.ldmParams.hashEveryLog < 32); + zc->ldmState.hashPower = + ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength); + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); U32 const divider = (params.cParams.searchLength==3) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; size_t const tokenSpace = blockSize + 11*maxNbSeq; - size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog); - size_t const hSize = ((size_t)1) << params.cParams.hashLog; - U32 const hashLog3 = (params.cParams.searchLength>3) ? 
0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog); - size_t const h3Size = ((size_t)1) << hashLog3; - size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0; + size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0; + size_t const matchStateSize = ZSTD_sizeof_matchState(¶ms.cParams, /* forCCtx */ 1); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); void* ptr; /* Check if workSpace is large enough, alloc a new one if needed */ - { size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<workSpaceSize < neededSpace) { + { size_t const entropySpace = HUF_WORKSPACE_SIZE; + size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t); + size_t const bufferSpace = buffInSize + buffOutSize; + size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams); + size_t const ldmSeqSpace = maxNbLdmSeq * sizeof(rawSeq); + + size_t const neededSpace = entropySpace + blockStateSpace + ldmSpace + + ldmSeqSpace + matchStateSize + tokenSpace + + bufferSpace; + DEBUGLOG(4, "Need %uKB workspace, including %uKB for match state, and %uKB for buffers", + (U32)(neededSpace>>10), (U32)(matchStateSize>>10), (U32)(bufferSpace>>10)); + DEBUGLOG(4, "windowSize: %u - blockSize: %u", (U32)windowSize, (U32)blockSize); + + if (zc->workSpaceSize < neededSpace) { /* too small : resize */ + DEBUGLOG(4, "Need to update workSpaceSize from %uK to %uK", + (unsigned)(zc->workSpaceSize>>10), + (unsigned)(neededSpace>>10)); + /* static cctx : no resize, error out */ + if (zc->staticSize) return ERROR(memory_allocation); + + zc->workSpaceSize = 0; ZSTD_free(zc->workSpace, zc->customMem); zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem); if (zc->workSpace == NULL) return ERROR(memory_allocation); zc->workSpaceSize = neededSpace; + ptr = zc->workSpace; + + /* Statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ + assert(((size_t)zc->workSpace & 3) == 0); /* ensure correct alignment */ + assert(zc->workSpaceSize >= 2 * sizeof(ZSTD_compressedBlockState_t)); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)zc->workSpace; + zc->blockState.nextCBlock = zc->blockState.prevCBlock + 1; + ptr = zc->blockState.nextCBlock + 1; + zc->entropyWorkspace = (U32*)ptr; } } - if (crp!=ZSTDcrp_noMemset) memset(zc->workSpace, 0, tableSpace); /* reset tables only */ + /* init params */ + zc->appliedParams = params; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; + if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (U32)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); + zc->blockSize = blockSize; + XXH64_reset(&zc->xxhState, 0); - zc->hashLog3 = hashLog3; - zc->hashTable = (U32*)(zc->workSpace); - zc->chainTable = zc->hashTable + hSize; - zc->hashTable3 = zc->chainTable + chainSize; - ptr = zc->hashTable3 + h3Size; - zc->hufTable = (HUF_CElt*)ptr; - zc->flagStaticTables = 0; - ptr = ((U32*)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */ - - zc->nextToUpdate = 1; - zc->nextSrc = NULL; - zc->base = NULL; - zc->dictBase = NULL; - zc->dictLimit = 0; - zc->lowLimit = 0; - zc->params = params; - zc->blockSize = blockSize; - zc->frameContentSize = frameContentSize; - { int i; for (i=0; irep[i] = repStartValue[i]; } - - if ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) { - zc->seqStore.litFreq = (U32*)ptr; - zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1<seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL+1); - zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML+1); - ptr = zc->seqStore.offCodeFreq + (MaxOff+1); - zc->seqStore.matchTable = (ZSTD_match_t*)ptr; 
- ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM+1; - zc->seqStore.priceTable = (ZSTD_optimal_t*)ptr; - ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM+1; - zc->seqStore.litLengthSum = 0; + zc->stage = ZSTDcs_init; + zc->dictID = 0; + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + + ptr = zc->entropyWorkspace + HUF_WORKSPACE_SIZE_U32; + + /* ldm hash table */ + /* initialize bucketOffsets table later for pointer alignment */ + if (params.ldmParams.enableLdm) { + size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; + memset(ptr, 0, ldmHSize * sizeof(ldmEntry_t)); + assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */ + zc->ldmState.hashTable = (ldmEntry_t*)ptr; + ptr = zc->ldmState.hashTable + ldmHSize; + zc->ldmSequences = (rawSeq*)ptr; + ptr = zc->ldmSequences + maxNbLdmSeq; + zc->maxNbLdmSequences = maxNbLdmSeq; + + memset(&zc->ldmState.window, 0, sizeof(zc->ldmState.window)); } + assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */ + + ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, ¶ms.cParams, crp, /* forCCtx */ 1); + + /* sequences storage */ zc->seqStore.sequencesStart = (seqDef*)ptr; ptr = zc->seqStore.sequencesStart + maxNbSeq; zc->seqStore.llCode = (BYTE*) ptr; zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq; zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq; zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq; - - zc->stage = ZSTDcs_init; - zc->dictID = 0; - zc->loadedDictEnd = 0; + ptr = zc->seqStore.litStart + blockSize; + + /* ldm bucketOffsets table */ + if (params.ldmParams.enableLdm) { + size_t const ldmBucketSize = + ((size_t)1) << (params.ldmParams.hashLog - + params.ldmParams.bucketSizeLog); + memset(ptr, 0, ldmBucketSize); + zc->ldmState.bucketOffsets = (BYTE*)ptr; + ptr = zc->ldmState.bucketOffsets + ldmBucketSize; + ZSTD_window_clear(&zc->ldmState.window); + } + ZSTD_referenceExternalSequences(zc, NULL, 0); + + /* buffers */ + zc->inBuffSize = buffInSize; + 
zc->inBuff = (char*)ptr; + zc->outBuffSize = buffOutSize; + zc->outBuff = zc->inBuff + buffInSize; return 0; } @@ -333,75 +1094,197 @@ * do not use with extDict variant ! */ void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) { int i; - for (i=0; irep[i] = 0; + for (i=0; iblockState.prevCBlock->rep[i] = 0; + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); } -/*! ZSTD_copyCCtx() : -* Duplicate an existing context `srcCCtx` into another one `dstCCtx`. -* Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). -* @return : 0, or an error code */ -size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) +static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + unsigned windowLog, + ZSTD_frameParameters fParams, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) { - if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong); - - memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); - ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params, pledgedSrcSize, ZSTDcrp_noMemset); + { ZSTD_CCtx_params params = cctx->requestedParams; + /* Copy only compression parameters related to tables. */ + params.cParams = cdict->cParams; + if (windowLog) params.cParams.windowLog = windowLog; + params.fParams = fParams; + ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_noMemset, zbuff); + assert(cctx->appliedParams.cParams.strategy == cdict->cParams.strategy); + assert(cctx->appliedParams.cParams.hashLog == cdict->cParams.hashLog); + assert(cctx->appliedParams.cParams.chainLog == cdict->cParams.chainLog); + } /* copy tables */ - { size_t const chainSize = (srcCCtx->params.cParams.strategy == ZSTD_fast) ? 
0 : (1 << srcCCtx->params.cParams.chainLog); - size_t const hSize = ((size_t)1) << srcCCtx->params.cParams.hashLog; - size_t const h3Size = (size_t)1 << srcCCtx->hashLog3; - size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); - memcpy(dstCCtx->workSpace, srcCCtx->workSpace, tableSpace); + { size_t const chainSize = (cdict->cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict->cParams.chainLog); + size_t const hSize = (size_t)1 << cdict->cParams.hashLog; + size_t const tableSpace = (chainSize + hSize) * sizeof(U32); + assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize); /* chainTable must follow hashTable */ + assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize); + assert((U32*)cdict->matchState.chainTable == (U32*)cdict->matchState.hashTable + hSize); /* chainTable must follow hashTable */ + assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize); + memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace); /* presumes all tables follow each other */ + } + /* Zero the hashTable3, since the cdict never fills it */ + { size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3; + assert(cdict->matchState.hashLog3 == 0); + memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); } /* copy dictionary offsets */ - dstCCtx->nextToUpdate = srcCCtx->nextToUpdate; - dstCCtx->nextToUpdate3= srcCCtx->nextToUpdate3; - dstCCtx->nextSrc = srcCCtx->nextSrc; - dstCCtx->base = srcCCtx->base; - dstCCtx->dictBase = srcCCtx->dictBase; - dstCCtx->dictLimit = srcCCtx->dictLimit; - dstCCtx->lowLimit = srcCCtx->lowLimit; - dstCCtx->loadedDictEnd= srcCCtx->loadedDictEnd; - dstCCtx->dictID = srcCCtx->dictID; - - /* copy entropy tables */ - dstCCtx->flagStaticTables = srcCCtx->flagStaticTables; - if (srcCCtx->flagStaticTables) { - memcpy(dstCCtx->hufTable, 
srcCCtx->hufTable, 256*4); - memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable)); - memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable)); - memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable)); + { + ZSTD_matchState_t const* srcMatchState = &cdict->matchState; + ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } + cctx->dictID = cdict->dictID; + + /* copy block state */ + memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); return 0; } - -/*! ZSTD_reduceTable() : -* reduce table indexes by `reducerValue` */ -static void ZSTD_reduceTable (U32* const table, U32 const size, U32 const reducerValue) +/*! ZSTD_copyCCtx_internal() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * The "context", in this case, refers to the hash and chain tables, + * entropy tables, and dictionary references. + * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx. 
+ * @return : 0, or an error code */ +static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + const ZSTD_CCtx* srcCCtx, + ZSTD_frameParameters fParams, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) { - U32 u; - for (u=0 ; u < size ; u++) { - if (table[u] < reducerValue) table[u] = 0; - else table[u] -= reducerValue; + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); + if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong); + + memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { ZSTD_CCtx_params params = dstCCtx->requestedParams; + /* Copy only compression parameters related to tables. */ + params.cParams = srcCCtx->appliedParams.cParams; + params.fParams = fParams; + ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTDcrp_noMemset, zbuff); + assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); + assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); + assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog); + assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog); + assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3); + } + + /* copy tables */ + { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 
0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; + size_t const h3Size = (size_t)1 << srcCCtx->blockState.matchState.hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + assert((U32*)dstCCtx->blockState.matchState.chainTable == (U32*)dstCCtx->blockState.matchState.hashTable + hSize); /* chainTable must follow hashTable */ + assert((U32*)dstCCtx->blockState.matchState.hashTable3 == (U32*)dstCCtx->blockState.matchState.chainTable + chainSize); + memcpy(dstCCtx->blockState.matchState.hashTable, srcCCtx->blockState.matchState.hashTable, tableSpace); /* presumes all tables follow each other */ + } + + /* copy dictionary offsets */ + { + ZSTD_matchState_t const* srcMatchState = &srcCCtx->blockState.matchState; + ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } + dstCCtx->dictID = srcCCtx->dictID; + + /* copy block state */ + memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); + + return 0; +} + +/*! ZSTD_copyCCtx() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * pledgedSrcSize==0 means "unknown". 
+* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) +{ + ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0); + ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); + + return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, + fParams, pledgedSrcSize, + zbuff); +} + + +#define ZSTD_ROWSIZE 16 +/*! ZSTD_reduceTable() : + * reduce table indexes by `reducerValue`, or squash to zero. + * PreserveMark preserves "unsorted mark" for btlazy2 strategy. + * It must be set to a clear 0/1 value, to remove branch during inlining. + * Presume table size is a multiple of ZSTD_ROWSIZE + * to help auto-vectorization */ +FORCE_INLINE_TEMPLATE void +ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) +{ + int const nbRows = (int)size / ZSTD_ROWSIZE; + int cellNb = 0; + int rowNb; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ + assert(size < (1U<<31)); /* can be casted to int */ + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { + int column; + for (column=0; columnparams.cParams.hashLog; - ZSTD_reduceTable(zc->hashTable, hSize, reducerValue); } - - { U32 const chainSize = (zc->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << zc->params.cParams.chainLog); - ZSTD_reduceTable(zc->chainTable, chainSize, reducerValue); } - - { U32 const h3Size = (zc->hashLog3) ? 
1 << zc->hashLog3 : 0; - ZSTD_reduceTable(zc->hashTable3, h3Size, reducerValue); } + ZSTD_matchState_t* const ms = &zc->blockState.matchState; + { U32 const hSize = (U32)1 << zc->appliedParams.cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); + } + + if (zc->appliedParams.cParams.strategy != ZSTD_fast) { + U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog; + if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2) + ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); + else + ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); + } + + if (ms->hashLog3) { + U32 const h3Size = (U32)1 << ms->hashLog3; + ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue); + } } @@ -435,10 +1318,11 @@ case 2: /* 2 - 2 - 12 */ MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4))); break; - default: /*note : should not be necessary : flSize is within {1,2,3} */ case 3: /* 2 - 2 - 20 */ MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4))); break; + default: /* not necessary : flSize is {1,2,3} */ + assert(0); } memcpy(ostart + flSize, src, srcSize); @@ -460,10 +1344,11 @@ case 2: /* 2 - 2 - 12 */ MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4))); break; - default: /*note : should not be necessary : flSize is necessarily within {1,2,3} */ case 3: /* 2 - 2 - 20 */ MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4))); break; + default: /* not necessary : flSize is {1,2,3} */ + assert(0); } ostart[flSize] = *(const BYTE*)src; @@ -473,9 +1358,12 @@ static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; } -static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc, +static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + ZSTD_strategy strategy, int disableLiteralCompression, void* dst, size_t dstCapacity, - const void* src, size_t srcSize) + const void* src, size_t srcSize, + U32* workspace, const 
int bmi2) { size_t const minGain = ZSTD_minGain(srcSize); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); @@ -484,27 +1372,50 @@ symbolEncodingType_e hType = set_compressed; size_t cLitSize; + DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i)", + disableLiteralCompression); + + /* Prepare nextEntropy assuming reusing the existing table */ + nextEntropy->hufCTable_repeatMode = prevEntropy->hufCTable_repeatMode; + memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, + sizeof(prevEntropy->hufCTable)); + + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); /* small ? don't even attempt compression (speed opt) */ -# define LITERAL_NOENTROPY 63 - { size_t const minLitSize = zc->flagStaticTables ? 6 : LITERAL_NOENTROPY; +# define COMPRESS_LITERALS_SIZE_MIN 63 + { size_t const minLitSize = (prevEntropy->hufCTable_repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall); /* not enough space for compression */ - if (zc->flagStaticTables && (lhSize==3)) { - hType = set_repeat; - singleStream = 1; - cLitSize = HUF_compress1X_usingCTable(ostart+lhSize, dstCapacity-lhSize, src, srcSize, zc->hufTable); - } else { - cLitSize = singleStream ? HUF_compress1X_wksp(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters)) - : HUF_compress4X_wksp(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters)); + { HUF_repeat repeat = prevEntropy->hufCTable_repeatMode; + int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; + cLitSize = singleStream ? 
HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, + workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2) + : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, + workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ + hType = set_repeat; + } } - if ((cLitSize==0) | (cLitSize >= srcSize - minGain)) + if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { + memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable)); return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - if (cLitSize==1) + } + if (cLitSize==1) { + memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable)); return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } + + if (hType == set_compressed) { + /* using a newly constructed table */ + nextEntropy->hufCTable_repeatMode = HUF_repeat_check; + } /* Build header */ switch(lhSize) @@ -519,40 +1430,21 @@ MEM_writeLE32(ostart, lhc); break; } - default: /* should not be necessary, lhSize is only {3,4,5} */ case 5: /* 2 - 2 - 18 - 18 */ { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); MEM_writeLE32(ostart, lhc); ostart[4] = (BYTE)(cLitSize >> 10); break; } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); } return lhSize+cLitSize; } -static const BYTE LL_Code[64] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 16, 17, 17, 18, 18, 19, 19, - 20, 20, 20, 20, 21, 21, 21, 21, - 22, 22, 22, 22, 22, 22, 22, 22, - 23, 23, 23, 23, 23, 23, 23, 23, - 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24 }; - -static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 32, 33, 33, 34, 
34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, - 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, - 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; - void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) { - BYTE const LL_deltaCode = 19; - BYTE const ML_deltaCode = 36; const seqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; BYTE* const ofCodeTable = seqStorePtr->ofCode; @@ -562,9 +1454,9 @@ for (u=0; u 63) ? (BYTE)ZSTD_highbit32(llv) + LL_deltaCode : LL_Code[llv]; + llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); - mlCodeTable[u] = (mlv>127) ? (BYTE)ZSTD_highbit32(mlv) + ML_deltaCode : ML_Code[mlv]; + mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); } if (seqStorePtr->longLengthID==1) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; @@ -572,17 +1464,252 @@ mlCodeTable[seqStorePtr->longLengthPos] = MaxML; } - -size_t ZSTD_compressSequences(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - size_t srcSize) +typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +} ZSTD_defaultPolicy_e; + +MEM_STATIC +symbolEncodingType_e ZSTD_selectEncodingType( + FSE_repeat* repeatMode, size_t const mostFrequent, size_t nbSeq, + U32 defaultNormLog, ZSTD_defaultPolicy_e const isDefaultAllowed) +{ +#define MIN_SEQ_FOR_DYNAMIC_FSE 64 +#define MAX_SEQ_FOR_STATIC_FSE 1000 + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if ((mostFrequent == nbSeq) && (!isDefaultAllowed || nbSeq > 2)) { + DEBUGLOG(5, "Selected set_rle"); + /* Prefer set_basic over set_rle when there are 2 or less symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. 
+ */ + *repeatMode = FSE_repeat_check; + return set_rle; + } + if ( isDefaultAllowed + && (*repeatMode == FSE_repeat_valid) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { + DEBUGLOG(5, "Selected set_repeat"); + return set_repeat; + } + if ( isDefaultAllowed + && ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (defaultNormLog-1)))) ) { + DEBUGLOG(5, "Selected set_basic"); + /* The format allows default tables to be repeated, but it isn't useful. + * When using simple heuristics to select encoding type, we don't want + * to confuse these tables with dictionaries. When running more careful + * analysis, we don't need to waste time checking both repeating tables + * and default tables. + */ + *repeatMode = FSE_repeat_none; + return set_basic; + } + DEBUGLOG(5, "Selected set_compressed"); + *repeatMode = FSE_repeat_check; + return set_compressed; +} + +MEM_STATIC +size_t ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + U32* count, U32 max, + BYTE const* codeTable, size_t nbSeq, + S16 const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + FSE_CTable const* prevCTable, size_t prevCTableSize, + void* workspace, size_t workspaceSize) +{ + BYTE* op = (BYTE*)dst; + BYTE const* const oend = op + dstCapacity; + + switch (type) { + case set_rle: + *op = codeTable[0]; + CHECK_F(FSE_buildCTable_rle(nextCTable, (BYTE)max)); + return 1; + case set_repeat: + memcpy(nextCTable, prevCTable, prevCTableSize); + return 0; + case set_basic: + CHECK_F(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, workspace, workspaceSize)); /* note : could be pre-calculated */ + return 0; + case set_compressed: { + S16 norm[MaxSeq + 1]; + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + if (count[codeTable[nbSeq-1]] > 1) { + count[codeTable[nbSeq-1]]--; + nbSeq_1--; + } + assert(nbSeq_1 > 1); + CHECK_F(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max)); + { 
size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) return NCountSize; + CHECK_F(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, workspace, workspaceSize)); + return NCountSize; + } + } + default: return assert(0), ERROR(GENERIC); + } +} + +FORCE_INLINE_TEMPLATE size_t +ZSTD_encodeSequences_body( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) { - const seqStore_t* seqStorePtr = &(zc->seqStore); + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + CHECK_E(BIT_initCStream(&blockStream, dst, dstCapacity), dstSize_tooSmall); /* not enough space remaining */ + + /* first symbols */ + FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); + FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); + FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + if (longOffsets) { + U32 const ofBits = ofCodeTable[nbSeq-1]; + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits); + BIT_flushBits(&blockStream); + } + BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + ofBits - extraBits); + } else { + BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); + } + BIT_flushBits(&blockStream); + + { size_t n; + for 
(n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) + BIT_flushBits(&blockStream); /* (7)*/ + BIT_addBits(&blockStream, sequences[n].litLength, llBits); + if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); + if (longOffsets) { + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_flushBits(&blockStream); /* (7)*/ + } + BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + ofBits - extraBits); /* 31 */ + } else { + BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + } + BIT_flushBits(&blockStream); /* (7)*/ + } } + + DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog); + FSE_flushCState(&blockStream, &stateMatchLength); + DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog); + FSE_flushCState(&blockStream, &stateOffsetBits); + DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog); + FSE_flushCState(&blockStream, &stateLitLength); + + { size_t const streamSize = BIT_closeCStream(&blockStream); + if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ + return streamSize; + } +} + +static size_t +ZSTD_encodeSequences_default( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + + +#if DYNAMIC_BMI2 + +static 
TARGET_ATTRIBUTE("bmi2") size_t +ZSTD_encodeSequences_bmi2( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + +#endif + +size_t ZSTD_encodeSequences( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return ZSTD_encodeSequences_bmi2(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); + } +#endif + (void)bmi2; + return ZSTD_encodeSequences_default(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + +MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, + ZSTD_entropyCTables_t const* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + ZSTD_CCtx_params const* cctxParams, + void* dst, size_t dstCapacity, U32* workspace, + const int bmi2) +{ + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; U32 count[MaxSeq+1]; - S16 norm[MaxSeq+1]; - FSE_CTable* CTable_LitLength = zc->litlengthCTable; - FSE_CTable* CTable_OffsetBits = zc->offcodeCTable; - FSE_CTable* CTable_MatchLength = zc->matchlengthCTable; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = 
nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences = seqStorePtr->sequencesStart; const BYTE* const ofCodeTable = seqStorePtr->ofCode; @@ -593,1699 +1720,261 @@ BYTE* op = ostart; size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; BYTE* seqHead; - BYTE scratchBuffer[1<= (1<litStart; size_t const litSize = seqStorePtr->lit - literals; - size_t const cSize = ZSTD_compressLiterals(zc, op, dstCapacity, literals, litSize); - if (ZSTD_isError(cSize)) return cSize; + size_t const cSize = ZSTD_compressLiterals( + prevEntropy, nextEntropy, + cctxParams->cParams.strategy, cctxParams->disableLiteralCompression, + op, dstCapacity, + literals, litSize, + workspace, bmi2); + if (ZSTD_isError(cSize)) + return cSize; + assert(cSize <= dstCapacity); op += cSize; } /* Sequences Header */ - if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall); - if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq; - else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; - else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; - if (nbSeq==0) goto _check_compressibility; + if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/) return ERROR(dstSize_tooSmall); + if (nbSeq < 0x7F) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { + memcpy(nextEntropy->litlengthCTable, prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable)); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + memcpy(nextEntropy->offcodeCTable, prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable)); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + memcpy(nextEntropy->matchlengthCTable, 
prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + return op - ostart; + } /* seqHead : flags for FSE encoding type */ seqHead = op++; -#define MIN_SEQ_FOR_DYNAMIC_FSE 64 -#define MAX_SEQ_FOR_STATIC_FSE 1000 - /* convert length/distances into codes */ ZSTD_seqToCodes(seqStorePtr); - - /* CTable for Literal Lengths */ + /* build CTable for Literal Lengths */ { U32 max = MaxLL; - size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, zc->tmpCounters); - if ((mostFrequent == nbSeq) && (nbSeq > 2)) { - *op++ = llCodeTable[0]; - FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); - LLtype = set_rle; - } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - LLtype = set_repeat; - } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog-1)))) { - FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); - LLtype = set_basic; - } else { - size_t nbSeq_1 = nbSeq; - const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); - if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; } - FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); - { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ - if (FSE_isError(NCountSize)) return ERROR(GENERIC); - op += NCountSize; } - FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); - LLtype = set_compressed; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace); + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, mostFrequent, nbSeq, LL_defaultNormLog, ZSTD_defaultAllowed); + { size_t const countSize = 
ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, + count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), + workspace, HUF_WORKSPACE_SIZE); + if (ZSTD_isError(countSize)) return countSize; + op += countSize; } } - - /* CTable for Offsets */ + /* build CTable for Offsets */ { U32 max = MaxOff; - size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, zc->tmpCounters); - if ((mostFrequent == nbSeq) && (nbSeq > 2)) { - *op++ = ofCodeTable[0]; - FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); - Offtype = set_rle; - } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - Offtype = set_repeat; - } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog-1)))) { - FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); - Offtype = set_basic; - } else { - size_t nbSeq_1 = nbSeq; - const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); - if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; } - FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); - { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ - if (FSE_isError(NCountSize)) return ERROR(GENERIC); - op += NCountSize; } - FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); - Offtype = set_compressed; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace); + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, mostFrequent, nbSeq, OF_defaultNormLog, defaultPolicy); + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, + count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), + workspace, HUF_WORKSPACE_SIZE); + if (ZSTD_isError(countSize)) return countSize; + op += countSize; } } - - /* CTable for MatchLengths */ + /* build CTable for MatchLengths */ { U32 max = MaxML; - size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, zc->tmpCounters); - if ((mostFrequent == nbSeq) && (nbSeq > 2)) { - *op++ = *mlCodeTable; - FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); - MLtype = set_rle; - } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - MLtype = set_repeat; - } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog-1)))) { - FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); - MLtype = set_basic; - } else { - size_t nbSeq_1 = nbSeq; - const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); - if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; } - FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); - { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ - if (FSE_isError(NCountSize)) return ERROR(GENERIC); - op += NCountSize; } - FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); - MLtype = set_compressed; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace); + 
DEBUGLOG(5, "Building ML table"); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, mostFrequent, nbSeq, ML_defaultNormLog, ZSTD_defaultAllowed); + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, + count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), + workspace, HUF_WORKSPACE_SIZE); + if (ZSTD_isError(countSize)) return countSize; + op += countSize; } } *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); - zc->flagStaticTables = 0; - - /* Encoding Sequences */ - { BIT_CStream_t blockStream; - FSE_CState_t stateMatchLength; - FSE_CState_t stateOffsetBits; - FSE_CState_t stateLitLength; - - CHECK_E(BIT_initCStream(&blockStream, op, oend-op), dstSize_tooSmall); /* not enough space remaining */ - - /* first symbols */ - FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); - FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); - FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); - BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); - if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); - if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); - BIT_flushBits(&blockStream); - - { size_t n; - for (n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) - BIT_flushBits(&blockStream); /* (7)*/ - BIT_addBits(&blockStream, sequences[n].litLength, llBits); - if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); - if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/ - 
BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ - BIT_flushBits(&blockStream); /* (7)*/ - } } - - FSE_flushCState(&blockStream, &stateMatchLength); - FSE_flushCState(&blockStream, &stateOffsetBits); - FSE_flushCState(&blockStream, &stateLitLength); - - { size_t const streamSize = BIT_closeCStream(&blockStream); - if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ - op += streamSize; - } } - - /* check compressibility */ -_check_compressibility: - { size_t const minGain = ZSTD_minGain(srcSize); - size_t const maxCSize = srcSize - minGain; - if ((size_t)(op-ostart) >= maxCSize) return 0; } - - /* confirm repcodes */ - { int i; for (i=0; irep[i] = zc->repToConfirm[i]; } + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, oend - op, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, + longOffsets, bmi2); + if (ZSTD_isError(bitstreamSize)) return bitstreamSize; + op += bitstreamSize; + } return op - ostart; } - -#if 0 /* for debug */ -# define STORESEQ_DEBUG -#include /* fprintf */ -U32 g_startDebug = 0; -const BYTE* g_start = NULL; -#endif - -/*! ZSTD_storeSeq() : - Store a sequence (literal length, literals, offset code and match length code) into seqStore_t. - `offsetCode` : distance to match, or 0 == repCode. 
- `matchCode` : matchLength - MINMATCH -*/ -MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t matchCode) -{ -#ifdef STORESEQ_DEBUG - if (g_startDebug) { - const U32 pos = (U32)((const BYTE*)literals - g_start); - if (g_start==NULL) g_start = (const BYTE*)literals; - if ((pos > 1895000) && (pos < 1895300)) - fprintf(stderr, "Cpos %6u :%5u literals & match %3u bytes at distance %6u \n", - pos, (U32)litLength, (U32)matchCode+MINMATCH, (U32)offsetCode); - } -#endif - /* copy Literals */ - ZSTD_wildcopy(seqStorePtr->lit, literals, litLength); - seqStorePtr->lit += litLength; - - /* literal Length */ - if (litLength>0xFFFF) { seqStorePtr->longLengthID = 1; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); } - seqStorePtr->sequences[0].litLength = (U16)litLength; - - /* match offset */ - seqStorePtr->sequences[0].offset = offsetCode + 1; - - /* match Length */ - if (matchCode>0xFFFF) { seqStorePtr->longLengthID = 2; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); } - seqStorePtr->sequences[0].matchLength = (U16)matchCode; - - seqStorePtr->sequences++; -} - - -/*-************************************* -* Match length counter -***************************************/ -static unsigned ZSTD_NbCommonBytes (register size_t val) +MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr, + ZSTD_entropyCTables_t const* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + ZSTD_CCtx_params const* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, U32* workspace, int bmi2) { - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 
3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } -} - - -static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) -{ - const BYTE* const pStart = pIn; - const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1); - - while (pIn < pInLoopLimit) { - size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); - if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; } - pIn += 
ZSTD_NbCommonBytes(diff); - return (size_t)(pIn - pStart); + size_t const cSize = ZSTD_compressSequences_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, dst, dstCapacity, + workspace, bmi2); + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ + if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) + return 0; /* block not compressed */ + if (ZSTD_isError(cSize)) return cSize; + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize); /* note : fixed formula, maybe should depend on compression level, or strategy */ + if (cSize >= maxCSize) return 0; /* block not compressed */ } - if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; } - if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; } - if ((pInoffcode_repeatMode == FSE_repeat_valid) + nextEntropy->offcode_repeatMode = FSE_repeat_check; + + return cSize; } - -/*-************************************* -* Hashes -***************************************/ -static const U32 prime3bytes = 506832829U; -static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } -MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ - -static const U32 prime4bytes = 2654435761U; -static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } -static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } - -static const U64 prime5bytes = 889523592379ULL; -static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } -static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } - -static const U64 prime6bytes = 
227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } - -static const U64 prime7bytes = 58295818150454627ULL; -static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } -static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } - -static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } - -static size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) -{ - switch(mls) - { - default: - case 4: return ZSTD_hash4Ptr(p, hBits); - case 5: return ZSTD_hash5Ptr(p, hBits); - case 6: return ZSTD_hash6Ptr(p, hBits); - case 7: return ZSTD_hash7Ptr(p, hBits); - case 8: return ZSTD_hash8Ptr(p, hBits); - } -} - - -/*-************************************* -* Fast Scan -***************************************/ -static void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls) -{ - U32* const hashTable = zc->hashTable; - U32 const hBits = zc->params.cParams.hashLog; - const BYTE* const base = zc->base; - const BYTE* ip = base + zc->nextToUpdate; - const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; - const size_t fastHashFillStep = 3; - - while(ip <= iend) { - hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base); - ip += fastHashFillStep; - } -} - - -FORCE_INLINE -void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx, - const void* src, size_t srcSize, - const U32 mls) +/* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict) { - U32* const hashTable = 
cctx->hashTable; - U32 const hBits = cctx->params.cParams.hashLog; - seqStore_t* seqStorePtr = &(cctx->seqStore); - const BYTE* const base = cctx->base; - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const U32 lowestIndex = cctx->dictLimit; - const BYTE* const lowest = base + lowestIndex; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=cctx->rep[0], offset_2=cctx->rep[1]; - U32 offsetSaved = 0; - - /* init */ - ip += (ip==lowest); - { U32 const maxRep = (U32)(ip-lowest); - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; - } - - /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ - size_t mLength; - size_t const h = ZSTD_hashPtr(ip, hBits, mls); - U32 const current = (U32)(ip-base); - U32 const matchIndex = hashTable[h]; - const BYTE* match = base + matchIndex; - hashTable[h] = current; /* update hash table */ - - if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { - mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH); - } else { - U32 offset; - if ( (matchIndex <= lowestIndex) || (MEM_read32(match) != MEM_read32(ip)) ) { - ip += ((ip-anchor) >> g_searchStrength) + 1; - continue; - } - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - offset = (U32)(ip-match); - while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - } - - /* match found */ - ip += mLength; - anchor = ip; - - if (ip <= ilimit) { - /* Fill Table */ - hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2; /* here because current+2 could be > iend-8 */ - 
hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base); - /* check immediate repcode */ - while ( (ip <= ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ - hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip-base); - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH); - ip += rLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } } - - /* save reps for next block */ - cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved; - cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved; - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } + static const ZSTD_blockCompressor blockCompressor[2][(unsigned)ZSTD_btultra+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, + ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2, + ZSTD_compressBlock_btopt, ZSTD_compressBlock_btultra }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, + ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, + ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btultra_extDict } + }; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + + assert((U32)strat >= (U32)ZSTD_fast); + assert((U32)strat <= (U32)ZSTD_btultra); + return blockCompressor[extDict!=0][(U32)strat]; } - -static void ZSTD_compressBlock_fast(ZSTD_CCtx* ctx, - const void* src, size_t srcSize) +static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) +{ 
+ memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; +} + +static void ZSTD_resetSeqStore(seqStore_t* ssPtr) { - const U32 mls = ctx->params.cParams.searchLength; - switch(mls) - { - default: - case 4 : - ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return; - case 5 : - ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5); return; - case 6 : - ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6); return; - case 7 : - ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7); return; - } + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthID = 0; } - -static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx, - const void* src, size_t srcSize, - const U32 mls) +static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { - U32* hashTable = ctx->hashTable; - const U32 hBits = ctx->params.cParams.hashLog; - seqStore_t* seqStorePtr = &(ctx->seqStore); - const BYTE* const base = ctx->base; - const BYTE* const dictBase = ctx->dictBase; - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const U32 lowestIndex = ctx->lowLimit; - const BYTE* const dictStart = dictBase + lowestIndex; - const U32 dictLimit = ctx->dictLimit; - const BYTE* const lowPrefixPtr = base + dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - U32 offset_1=ctx->rep[0], offset_2=ctx->rep[1]; - - /* Search Loop */ - while (ip < ilimit) { /* < instead of <=, because (ip+1) */ - const size_t h = ZSTD_hashPtr(ip, hBits, mls); - const U32 matchIndex = hashTable[h]; - const BYTE* matchBase = matchIndex < dictLimit ? 
dictBase : base; - const BYTE* match = matchBase + matchIndex; - const U32 current = (U32)(ip-base); - const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ - const BYTE* repBase = repIndex < dictLimit ? dictBase : base; - const BYTE* repMatch = repBase + repIndex; - size_t mLength; - hashTable[h] = current; /* update hash table */ - - if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32; - ip++; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH); - } else { - if ( (matchIndex < lowestIndex) || - (MEM_read32(match) != MEM_read32(ip)) ) { - ip += ((ip-anchor) >> g_searchStrength) + 1; - continue; - } - { const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr; - U32 offset; - mLength = ZSTD_count_2segments(ip+EQUAL_READ32, match+EQUAL_READ32, iend, matchEnd, lowPrefixPtr) + EQUAL_READ32; - while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset = current - matchIndex; - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - } } - - /* found a match : store it */ - ip += mLength; - anchor = ip; - - if (ip <= ilimit) { - /* Fill Table */ - hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2; - hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base); - /* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < dictLimit ? 
dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend; - size_t repLength2 = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch2+EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH); - hashTable[ZSTD_hashPtr(ip, hBits, mls)] = current2; - ip += repLength2; - anchor = ip; - continue; - } - break; - } } } - - /* save reps for next block */ - ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2; - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; + ZSTD_matchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (U32)dstCapacity, ms->window.dictLimit, ms->nextToUpdate); + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { + ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.searchLength); + return 0; /* don't even attempt compression below a certain srcSize */ } -} - - -static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx, - const void* src, size_t srcSize) -{ - U32 const mls = ctx->params.cParams.searchLength; - switch(mls) - { - default: - case 4 : - ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return; - case 5 : - ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5); return; - case 6 : - ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6); return; - case 7 : - ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7); return; - } -} - - -/*-************************************* -* Double Fast 
-***************************************/ -static void ZSTD_fillDoubleHashTable (ZSTD_CCtx* cctx, const void* end, const U32 mls) -{ - U32* const hashLarge = cctx->hashTable; - U32 const hBitsL = cctx->params.cParams.hashLog; - U32* const hashSmall = cctx->chainTable; - U32 const hBitsS = cctx->params.cParams.chainLog; - const BYTE* const base = cctx->base; - const BYTE* ip = base + cctx->nextToUpdate; - const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; - const size_t fastHashFillStep = 3; - - while(ip <= iend) { - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base); - hashLarge[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base); - ip += fastHashFillStep; - } -} - - -FORCE_INLINE -void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx, - const void* src, size_t srcSize, - const U32 mls) -{ - U32* const hashLong = cctx->hashTable; - const U32 hBitsL = cctx->params.cParams.hashLog; - U32* const hashSmall = cctx->chainTable; - const U32 hBitsS = cctx->params.cParams.chainLog; - seqStore_t* seqStorePtr = &(cctx->seqStore); - const BYTE* const base = cctx->base; - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const U32 lowestIndex = cctx->dictLimit; - const BYTE* const lowest = base + lowestIndex; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=cctx->rep[0], offset_2=cctx->rep[1]; - U32 offsetSaved = 0; - - /* init */ - ip += (ip==lowest); - { U32 const maxRep = (U32)(ip-lowest); - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + ZSTD_resetSeqStore(&(zc->seqStore)); + + /* limited update after a very long match */ + { const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 current = (U32)(istart-base); + if (current > ms->nextToUpdate + 384) + ms->nextToUpdate = current - MIN(192, (U32)(current - 
ms->nextToUpdate - 384)); } - /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ - size_t mLength; - size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); - size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); - U32 const current = (U32)(ip-base); - U32 const matchIndexL = hashLong[h2]; - U32 const matchIndexS = hashSmall[h]; - const BYTE* matchLong = base + matchIndexL; - const BYTE* match = base + matchIndexS; - hashLong[h2] = hashSmall[h] = current; /* update hash tables */ - - if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { /* note : by construction, offset_1 <= current */ - mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH); - } else { - U32 offset; - if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) { - mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; - offset = (U32)(ip-matchLong); - while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) { - size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); - U32 const matchIndex3 = hashLong[h3]; - const BYTE* match3 = base + matchIndex3; - hashLong[h3] = current + 1; - if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { - mLength = ZSTD_count(ip+9, match3+8, iend) + 8; - ip++; - offset = (U32)(ip-match3); - while (((ip>anchor) & (match3>lowest)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ - } else { - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - offset = (U32)(ip-match); - while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - } - } else { - ip += ((ip-anchor) >> g_searchStrength) + 1; - continue; - } - - offset_2 = offset_1; - offset_1 = offset; - - ZSTD_storeSeq(seqStorePtr, 
ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + /* select and store sequences */ + { U32 const extDict = ZSTD_window_hasExtDict(ms->window); + size_t lastLLSize; + { int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; } - - /* match found */ - ip += mLength; - anchor = ip; - - if (ip <= ilimit) { - /* Fill Table */ - hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = - hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2; /* here because current+2 could be > iend-8 */ - hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = - hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base); - - /* check immediate repcode */ - while ( (ip <= ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH); - ip += rLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } } - - /* save reps for next block */ - cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved; - cctx->repToConfirm[1] = offset_2 ? 
offset_2 : offsetSaved; - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } -} - - -static void ZSTD_compressBlock_doubleFast(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - const U32 mls = ctx->params.cParams.searchLength; - switch(mls) - { - default: - case 4 : - ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4); return; - case 5 : - ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 5); return; - case 6 : - ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 6); return; - case 7 : - ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 7); return; - } -} - - -static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx, - const void* src, size_t srcSize, - const U32 mls) -{ - U32* const hashLong = ctx->hashTable; - U32 const hBitsL = ctx->params.cParams.hashLog; - U32* const hashSmall = ctx->chainTable; - U32 const hBitsS = ctx->params.cParams.chainLog; - seqStore_t* seqStorePtr = &(ctx->seqStore); - const BYTE* const base = ctx->base; - const BYTE* const dictBase = ctx->dictBase; - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const U32 lowestIndex = ctx->lowLimit; - const BYTE* const dictStart = dictBase + lowestIndex; - const U32 dictLimit = ctx->dictLimit; - const BYTE* const lowPrefixPtr = base + dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - U32 offset_1=ctx->rep[0], offset_2=ctx->rep[1]; - - /* Search Loop */ - while (ip < ilimit) { /* < instead of <=, because (ip+1) */ - const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls); - const U32 matchIndex = hashSmall[hSmall]; - const BYTE* matchBase = matchIndex < dictLimit ? 
dictBase : base; - const BYTE* match = matchBase + matchIndex; - - const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); - const U32 matchLongIndex = hashLong[hLong]; - const BYTE* matchLongBase = matchLongIndex < dictLimit ? dictBase : base; - const BYTE* matchLong = matchLongBase + matchLongIndex; - - const U32 current = (U32)(ip-base); - const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ - const BYTE* repBase = repIndex < dictLimit ? dictBase : base; - const BYTE* repMatch = repBase + repIndex; - size_t mLength; - hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ - - if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4; - ip++; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH); - } else { - if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { - const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr; - U32 offset; - mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, lowPrefixPtr) + 8; - offset = current - matchLongIndex; - while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - - } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) { - size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); - U32 const matchIndex3 = hashLong[h3]; - const BYTE* const match3Base = matchIndex3 < dictLimit ? 
dictBase : base; - const BYTE* match3 = match3Base + matchIndex3; - U32 offset; - hashLong[h3] = current + 1; - if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { - const BYTE* matchEnd = matchIndex3 < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr; - mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, lowPrefixPtr) + 8; - ip++; - offset = current+1 - matchIndex3; - while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ - } else { - const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr; - mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4; - offset = current - matchIndex; - while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - } - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - - } else { - ip += ((ip-anchor) >> g_searchStrength) + 1; - continue; - } } - - /* found a match : store it */ - ip += mLength; - anchor = ip; - - if (ip <= ilimit) { - /* Fill Table */ - hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2; - hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2; - hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base); - hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); - /* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < dictLimit ? 
dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch2+EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; - ip += repLength2; - anchor = ip; - continue; - } - break; - } } } - - /* save reps for next block */ - ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2; - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } -} - - -static void ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx* ctx, - const void* src, size_t srcSize) -{ - U32 const mls = ctx->params.cParams.searchLength; - switch(mls) - { - default: - case 4 : - ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4); return; - case 5 : - ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 5); return; - case 6 : - ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 6); return; - case 7 : - ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 7); return; + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(!zc->appliedParams.ldmParams.enableLdm); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + &zc->appliedParams.cParams, + src, srcSize, extDict); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm) { + rawSeqStore_t 
ldmSeqStore = {NULL, 0, 0, 0}; + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ + CHECK_F(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, + &zc->appliedParams.ldmParams, + src, srcSize)); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&ldmSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + &zc->appliedParams.cParams, + src, srcSize, extDict); + assert(ldmSeqStore.pos == ldmSeqStore.size); + } else { /* not long range mode */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, extDict); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, &zc->appliedParams.cParams, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } + + /* encode sequences and literals */ + { size_t const cSize = ZSTD_compressSequences(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, zc->entropyWorkspace, zc->bmi2); + if (ZSTD_isError(cSize) || cSize == 0) return cSize; + /* confirm repcodes and entropy tables */ + { ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; + zc->blockState.prevCBlock = zc->blockState.nextCBlock; + zc->blockState.nextCBlock = tmp; + } + return cSize; } } -/*-************************************* -* Binary Tree search -***************************************/ -/** ZSTD_insertBt1() : add one or multiple positions to tree. -* ip : assumed <= iend-8 . 
-* @return : nb of positions added */ -static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, const BYTE* const iend, U32 nbCompares, - U32 extDict) -{ - U32* const hashTable = zc->hashTable; - U32 const hashLog = zc->params.cParams.hashLog; - size_t const h = ZSTD_hashPtr(ip, hashLog, mls); - U32* const bt = zc->chainTable; - U32 const btLog = zc->params.cParams.chainLog - 1; - U32 const btMask = (1 << btLog) - 1; - U32 matchIndex = hashTable[h]; - size_t commonLengthSmaller=0, commonLengthLarger=0; - const BYTE* const base = zc->base; - const BYTE* const dictBase = zc->dictBase; - const U32 dictLimit = zc->dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const prefixStart = base + dictLimit; - const BYTE* match; - const U32 current = (U32)(ip-base); - const U32 btLow = btMask >= current ? 0 : current - btMask; - U32* smallerPtr = bt + 2*(current&btMask); - U32* largerPtr = smallerPtr + 1; - U32 dummy32; /* to be nullified at the end */ - U32 const windowLow = zc->lowLimit; - U32 matchEndIdx = current+8; - size_t bestLength = 8; -#ifdef ZSTD_C_PREDICT - U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0); - U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1); - predictedSmall += (predictedSmall>0); - predictedLarge += (predictedLarge>0); -#endif /* ZSTD_C_PREDICT */ - - hashTable[h] = current; /* Update Hash Table */ - - while (nbCompares-- && (matchIndex > windowLow)) { - U32* const nextPtr = bt + 2*(matchIndex & btMask); - size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ - -#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ - const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ - if (matchIndex == predictedSmall) { - /* no need to check length, result known */ - *smallerPtr = matchIndex; - if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, 
stop the search */ - smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ - matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ - predictedSmall = predictPtr[1] + (predictPtr[1]>0); - continue; - } - if (matchIndex == predictedLarge) { - *largerPtr = matchIndex; - if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - largerPtr = nextPtr; - matchIndex = nextPtr[0]; - predictedLarge = predictPtr[0] + (predictPtr[0]>0); - continue; - } -#endif - if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { - match = base + matchIndex; - if (match[matchLength] == ip[matchLength]) - matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1; - } else { - match = dictBase + matchIndex; - matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); - if (matchIndex+matchLength >= dictLimit) - match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ - } - - if (matchLength > bestLength) { - bestLength = matchLength; - if (matchLength > matchEndIdx - matchIndex) - matchEndIdx = matchIndex + (U32)matchLength; - } - - if (ip+matchLength == iend) /* equal : no way to know if inf or sup */ - break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt the tree */ - - if (match[matchLength] < ip[matchLength]) { /* necessarily within correct buffer */ - /* match is smaller than current */ - *smallerPtr = matchIndex; /* update smaller idx */ - commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ - if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ - matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ - } else { - /* match is larger than current */ - *largerPtr = matchIndex; - 
commonLengthLarger = matchLength; - if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - largerPtr = nextPtr; - matchIndex = nextPtr[0]; - } } - - *smallerPtr = *largerPtr = 0; - if (bestLength > 384) return MIN(192, (U32)(bestLength - 384)); /* speed optimization */ - if (matchEndIdx > current + 8) return matchEndIdx - current - 8; - return 1; -} - - -static size_t ZSTD_insertBtAndFindBestMatch ( - ZSTD_CCtx* zc, - const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, - U32 nbCompares, const U32 mls, - U32 extDict) -{ - U32* const hashTable = zc->hashTable; - U32 const hashLog = zc->params.cParams.hashLog; - size_t const h = ZSTD_hashPtr(ip, hashLog, mls); - U32* const bt = zc->chainTable; - U32 const btLog = zc->params.cParams.chainLog - 1; - U32 const btMask = (1 << btLog) - 1; - U32 matchIndex = hashTable[h]; - size_t commonLengthSmaller=0, commonLengthLarger=0; - const BYTE* const base = zc->base; - const BYTE* const dictBase = zc->dictBase; - const U32 dictLimit = zc->dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const prefixStart = base + dictLimit; - const U32 current = (U32)(ip-base); - const U32 btLow = btMask >= current ? 
0 : current - btMask; - const U32 windowLow = zc->lowLimit; - U32* smallerPtr = bt + 2*(current&btMask); - U32* largerPtr = bt + 2*(current&btMask) + 1; - U32 matchEndIdx = current+8; - U32 dummy32; /* to be nullified at the end */ - size_t bestLength = 0; - - hashTable[h] = current; /* Update Hash Table */ - - while (nbCompares-- && (matchIndex > windowLow)) { - U32* const nextPtr = bt + 2*(matchIndex & btMask); - size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ - const BYTE* match; - - if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { - match = base + matchIndex; - if (match[matchLength] == ip[matchLength]) - matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1; - } else { - match = dictBase + matchIndex; - matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); - if (matchIndex+matchLength >= dictLimit) - match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ - } - - if (matchLength > bestLength) { - if (matchLength > matchEndIdx - matchIndex) - matchEndIdx = matchIndex + (U32)matchLength; - if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; - if (ip+matchLength == iend) /* equal : no way to know if inf or sup */ - break; /* drop, to guarantee consistency (miss a little bit of compression) */ - } - - if (match[matchLength] < ip[matchLength]) { - /* match is smaller than current */ - *smallerPtr = matchIndex; /* update smaller idx */ - commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ - if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ - matchIndex = nextPtr[1]; /* new matchIndex larger 
than previous (closer to current) */ - } else { - /* match is larger than current */ - *largerPtr = matchIndex; - commonLengthLarger = matchLength; - if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - largerPtr = nextPtr; - matchIndex = nextPtr[0]; - } } - - *smallerPtr = *largerPtr = 0; - - zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1; - return bestLength; -} - - -static void ZSTD_updateTree(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls) -{ - const BYTE* const base = zc->base; - const U32 target = (U32)(ip - base); - U32 idx = zc->nextToUpdate; - - while(idx < target) - idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 0); -} - -/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ -static size_t ZSTD_BtFindBestMatch ( - ZSTD_CCtx* zc, - const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 mls) -{ - if (ip < zc->base + zc->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls); - return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 0); -} - - -static size_t ZSTD_BtFindBestMatch_selectMLS ( - ZSTD_CCtx* zc, /* Index table will be updated */ - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 matchLengthSearch) -{ - switch(matchLengthSearch) - { - default : - case 4 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4); - case 5 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5); - case 6 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6); - } -} - - -static void ZSTD_updateTree_extDict(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls) -{ - const BYTE* const base = zc->base; - const U32 target = (U32)(ip - base); - U32 idx = 
zc->nextToUpdate; - - while (idx < target) idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 1); -} - - -/** Tree updater, providing best match */ -static size_t ZSTD_BtFindBestMatch_extDict ( - ZSTD_CCtx* zc, - const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 mls) -{ - if (ip < zc->base + zc->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls); - return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 1); -} - - -static size_t ZSTD_BtFindBestMatch_selectMLS_extDict ( - ZSTD_CCtx* zc, /* Index table will be updated */ - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 matchLengthSearch) -{ - switch(matchLengthSearch) - { - default : - case 4 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4); - case 5 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5); - case 6 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6); - } -} - - - -/* ********************************* -* Hash Chain -***********************************/ -#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & mask] - -/* Update chains up to ip (excluded) - Assumption : always within prefix (ie. 
not within extDict) */ -FORCE_INLINE -U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls) -{ - U32* const hashTable = zc->hashTable; - const U32 hashLog = zc->params.cParams.hashLog; - U32* const chainTable = zc->chainTable; - const U32 chainMask = (1 << zc->params.cParams.chainLog) - 1; - const BYTE* const base = zc->base; - const U32 target = (U32)(ip - base); - U32 idx = zc->nextToUpdate; - - while(idx < target) { /* catch up */ - size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); - NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; - hashTable[h] = idx; - idx++; - } - - zc->nextToUpdate = target; - return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; -} - - - -FORCE_INLINE /* inlining is important to hardwire a hot branch (template emulation) */ -size_t ZSTD_HcFindBestMatch_generic ( - ZSTD_CCtx* zc, /* Index table will be updated */ - const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 mls, const U32 extDict) -{ - U32* const chainTable = zc->chainTable; - const U32 chainSize = (1 << zc->params.cParams.chainLog); - const U32 chainMask = chainSize-1; - const BYTE* const base = zc->base; - const BYTE* const dictBase = zc->dictBase; - const U32 dictLimit = zc->dictLimit; - const BYTE* const prefixStart = base + dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; - const U32 lowLimit = zc->lowLimit; - const U32 current = (U32)(ip-base); - const U32 minChain = current > chainSize ? 
current - chainSize : 0; - int nbAttempts=maxNbAttempts; - size_t ml=EQUAL_READ32-1; - - /* HC4 match finder */ - U32 matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls); - - for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) { - const BYTE* match; - size_t currentMl=0; - if ((!extDict) || matchIndex >= dictLimit) { - match = base + matchIndex; - if (match[ml] == ip[ml]) /* potentially better */ - currentMl = ZSTD_count(ip, match, iLimit); - } else { - match = dictBase + matchIndex; - if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ - currentMl = ZSTD_count_2segments(ip+EQUAL_READ32, match+EQUAL_READ32, iLimit, dictEnd, prefixStart) + EQUAL_READ32; - } - - /* save best solution */ - if (currentMl > ml) { ml = currentMl; *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; if (ip+currentMl == iLimit) break; /* best possible, and avoid read overflow*/ } - - if (matchIndex <= minChain) break; - matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); - } - - return ml; -} - - -FORCE_INLINE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_CCtx* zc, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 matchLengthSearch) -{ - switch(matchLengthSearch) - { - default : - case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0); - case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0); - case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0); - } -} - - -FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( - ZSTD_CCtx* zc, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 maxNbAttempts, const U32 matchLengthSearch) -{ - switch(matchLengthSearch) - { - default : - case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1); - case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, 
iLimit, offsetPtr, maxNbAttempts, 5, 1); - case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1); - } -} - - -/* ******************************* -* Common parser - lazy strategy -*********************************/ -FORCE_INLINE -void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx, - const void* src, size_t srcSize, - const U32 searchMethod, const U32 depth) -{ - seqStore_t* seqStorePtr = &(ctx->seqStore); - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - const BYTE* const base = ctx->base + ctx->dictLimit; - - U32 const maxSearches = 1 << ctx->params.cParams.searchLog; - U32 const mls = ctx->params.cParams.searchLength; - - typedef size_t (*searchMax_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLimit, - size_t* offsetPtr, - U32 maxNbAttempts, U32 matchLengthSearch); - searchMax_f const searchMax = searchMethod ? 
ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS; - U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1], savedOffset=0; - - /* init */ - ip += (ip==base); - ctx->nextToUpdate3 = ctx->nextToUpdate; - { U32 const maxRep = (U32)(ip-base); - if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; - if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; - } - - /* Match Loop */ - while (ip < ilimit) { - size_t matchLength=0; - size_t offset=0; - const BYTE* start=ip+1; - - /* check repCode */ - if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) { - /* repcode : we take it */ - matchLength = ZSTD_count(ip+1+EQUAL_READ32, ip+1+EQUAL_READ32-offset_1, iend) + EQUAL_READ32; - if (depth==0) goto _storeSequence; - } - - /* first search (depth 0) */ - { size_t offsetFound = 99999999; - size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls); - if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; - } - - if (matchLength < EQUAL_READ32) { - ip += ((ip-anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */ - continue; - } - - /* let's try to find a better solution */ - if (depth>=1) - while (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { - size_t const mlRep = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32; - int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); - if ((mlRep >= EQUAL_READ32) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; - } - { size_t offset2=99999999; - size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); - if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; - continue; /* search a better one */ - } } - - /* 
let's find an even better one */ - if ((depth==2) && (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { - size_t const ml2 = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32; - int const gain2 = (int)(ml2 * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); - if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) - matchLength = ml2, offset = 0, start = ip; - } - { size_t offset2=99999999; - size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); - if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; - continue; - } } } - break; /* nothing found : store previous solution */ - } - - /* catch up */ - if (offset) { - while ((start>anchor) && (start>base+offset-ZSTD_REP_MOVE) && (start[-1] == start[-1-offset+ZSTD_REP_MOVE])) /* only search for offset within prefix */ - { start--; matchLength++; } - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); - } - - /* store sequence */ -_storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH); - anchor = ip = start + matchLength; - } - - /* check immediate repcode */ - while ( (ip <= ilimit) - && ((offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - matchLength = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_2, iend) + EQUAL_READ32; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH); - ip += matchLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } - - /* Save reps for next block */ - ctx->repToConfirm[0] = offset_1 ? offset_1 : savedOffset; - ctx->repToConfirm[1] = offset_2 ? 
offset_2 : savedOffset; - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } -} - - -static void ZSTD_compressBlock_btlazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2); -} - -static void ZSTD_compressBlock_lazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2); -} - -static void ZSTD_compressBlock_lazy(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1); -} - -static void ZSTD_compressBlock_greedy(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0); -} - - -FORCE_INLINE -void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx, - const void* src, size_t srcSize, - const U32 searchMethod, const U32 depth) -{ - seqStore_t* seqStorePtr = &(ctx->seqStore); - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - const BYTE* const base = ctx->base; - const U32 dictLimit = ctx->dictLimit; - const U32 lowestIndex = ctx->lowLimit; - const BYTE* const prefixStart = base + dictLimit; - const BYTE* const dictBase = ctx->dictBase; - const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const dictStart = dictBase + ctx->lowLimit; - - const U32 maxSearches = 1 << ctx->params.cParams.searchLog; - const U32 mls = ctx->params.cParams.searchLength; - - typedef size_t (*searchMax_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLimit, - size_t* offsetPtr, - U32 maxNbAttempts, U32 matchLengthSearch); - searchMax_f searchMax = searchMethod ? 
ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS; - - U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1]; - - /* init */ - ctx->nextToUpdate3 = ctx->nextToUpdate; - ip += (ip == prefixStart); - - /* Match Loop */ - while (ip < ilimit) { - size_t matchLength=0; - size_t offset=0; - const BYTE* start=ip+1; - U32 current = (U32)(ip-base); - - /* check repCode */ - { const U32 repIndex = (U32)(current+1 - offset_1); - const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; - const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ - if (MEM_read32(ip+1) == MEM_read32(repMatch)) { - /* repcode detected we should take it */ - const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; - matchLength = ZSTD_count_2segments(ip+1+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32; - if (depth==0) goto _storeSequence; - } } - - /* first search (depth 0) */ - { size_t offsetFound = 99999999; - size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls); - if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; - } - - if (matchLength < EQUAL_READ32) { - ip += ((ip-anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */ - continue; - } - - /* let's try to find a better solution */ - if (depth>=1) - while (ip= 3) & (repIndex > lowestIndex)) /* intentional overflow */ - if (MEM_read32(ip) == MEM_read32(repMatch)) { - /* repcode detected */ - const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; - size_t const repLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32; - int const gain2 = (int)(repLength * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); - if ((repLength >= EQUAL_READ32) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; - } } - - /* search match, depth 1 */ - { size_t offset2=99999999; - size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); - if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; - continue; /* search a better one */ - } } - - /* let's find an even better one */ - if ((depth==2) && (ip= 3) & (repIndex > lowestIndex)) /* intentional overflow */ - if (MEM_read32(ip) == MEM_read32(repMatch)) { - /* repcode detected */ - const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; - size_t repLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32; - int gain2 = (int)(repLength * 4); - int gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); - if ((repLength >= EQUAL_READ32) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; - } } - - /* search match, depth 2 */ - { size_t offset2=99999999; - size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); - if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; - continue; - } } } - break; /* nothing found : store previous solution */ - } - - /* catch up */ - if (offset) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); - const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; - const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; - while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); - } - - /* store sequence */ -_storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH); - anchor = ip = start + matchLength; - } - - /* check immediate repcode */ - while (ip <= ilimit) { - const U32 repIndex = (U32)((ip-base) - offset_2); - const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; - const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ - if (MEM_read32(ip) == MEM_read32(repMatch)) { - /* repcode detected we should take it */ - const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; - matchLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH); - ip += matchLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } - break; - } } - - /* Save reps for next block */ - ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2; - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } -} - - -void ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0); -} - -static void ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1); -} - -static void ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2); -} - -static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ - ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2); -} - - -/* The optimal parser */ -#include "zstd_opt.h" - -static void ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ -#ifdef ZSTD_OPT_H_91842398743 - ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0); -#else - (void)ctx; (void)src; (void)srcSize; - return; -#endif -} - -static void ZSTD_compressBlock_btopt2(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ -#ifdef ZSTD_OPT_H_91842398743 - ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1); -#else - (void)ctx; (void)src; (void)srcSize; - return; -#endif -} - -static void ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ -#ifdef 
ZSTD_OPT_H_91842398743 - ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 0); -#else - (void)ctx; (void)src; (void)srcSize; - return; -#endif -} - -static void ZSTD_compressBlock_btopt2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize) -{ -#ifdef ZSTD_OPT_H_91842398743 - ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 1); -#else - (void)ctx; (void)src; (void)srcSize; - return; -#endif -} - - -typedef void (*ZSTD_blockCompressor) (ZSTD_CCtx* ctx, const void* src, size_t srcSize); - -static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict) -{ - static const ZSTD_blockCompressor blockCompressor[2][8] = { - { ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2 }, - { ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict } - }; - - return blockCompressor[extDict][(U32)strat]; -} - - -static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize) -{ - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->params.cParams.strategy, zc->lowLimit < zc->dictLimit); - const BYTE* const base = zc->base; - const BYTE* const istart = (const BYTE*)src; - const U32 current = (U32)(istart-base); - if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) return 0; /* don't even attempt compression below a certain srcSize */ - ZSTD_resetSeqStore(&(zc->seqStore)); - if (current > zc->nextToUpdate + 384) - zc->nextToUpdate = current - MIN(192, (U32)(current - zc->nextToUpdate - 384)); /* update tree not updated after finding very long rep matches */ - blockCompressor(zc, src, 
srcSize); - return ZSTD_compressSequences(zc, dst, dstCapacity, srcSize); -} - - -/*! ZSTD_compress_generic() : +/*! ZSTD_compress_frameChunk() : * Compress a chunk of data into one or multiple blocks. * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. * Frame is supposed already started (header already produced) * @return : compressed size, or an error code */ -static size_t ZSTD_compress_generic (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) @@ -2295,61 +1984,62 @@ const BYTE* ip = (const BYTE*)src; BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; - U32 const maxDist = 1 << cctx->params.cParams.windowLog; - - if (cctx->params.fParams.checksumFlag && srcSize) + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; + assert(cctx->appliedParams.cParams.windowLog <= 31); + + DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (U32)blockSize); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) XXH64_update(&cctx->xxhState, src, srcSize); while (remaining) { + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); - size_t cSize; - - if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE) return ERROR(dstSize_tooSmall); /* not enough space to store compressed block */ + + if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE) + return ERROR(dstSize_tooSmall); /* not enough space to store compressed block */ if (remaining < blockSize) blockSize = remaining; - /* preemptive overflow correction */ - if (cctx->lowLimit > (2U<<30)) { - U32 const cycleMask = (1 << ZSTD_cycleLog(cctx->params.cParams.hashLog, cctx->params.cParams.strategy)) - 1; - U32 const current = (U32)(ip - cctx->base); - U32 const newCurrent = (current & cycleMask) + (1 << 
cctx->params.cParams.windowLog); - U32 const correction = current - newCurrent; - ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_64 <= 30); + if (ZSTD_window_needOverflowCorrection(ms->window, ip + blockSize)) { + U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy); + U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); + ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + ZSTD_reduceIndex(cctx, correction); - cctx->base += correction; - cctx->dictBase += correction; - cctx->lowLimit -= correction; - cctx->dictLimit -= correction; - if (cctx->nextToUpdate < correction) cctx->nextToUpdate = 0; - else cctx->nextToUpdate -= correction; + if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; + else ms->nextToUpdate -= correction; + ms->loadedDictEnd = 0; } - - if ((U32)(ip+blockSize - cctx->base) > cctx->loadedDictEnd + maxDist) { - /* enforce maxDist */ - U32 const newLowLimit = (U32)(ip+blockSize - cctx->base) - maxDist; - if (cctx->lowLimit < newLowLimit) cctx->lowLimit = newLowLimit; - if (cctx->dictLimit < cctx->lowLimit) cctx->dictLimit = cctx->lowLimit; - } - - cSize = ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, ip, blockSize); - if (ZSTD_isError(cSize)) return cSize; - - if (cSize == 0) { /* block is not compressible */ - U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3); - if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall); - MEM_writeLE32(op, cBlockHeader24); /* no pb, 4th byte will be overwritten */ - memcpy(op + ZSTD_blockHeaderSize, ip, blockSize); - cSize = ZSTD_blockHeaderSize+blockSize; - } else { - U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); - MEM_writeLE24(op, cBlockHeader24); - cSize += ZSTD_blockHeaderSize; - } - - 
remaining -= blockSize; - dstCapacity -= cSize; - ip += blockSize; - op += cSize; - } + ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd); + if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; + + { size_t cSize = ZSTD_compressBlock_internal(cctx, + op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, + ip, blockSize); + if (ZSTD_isError(cSize)) return cSize; + + if (cSize == 0) { /* block is not compressible */ + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3); + if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall); + MEM_writeLE32(op, cBlockHeader24); /* 4th byte will be overwritten */ + memcpy(op + ZSTD_blockHeaderSize, ip, blockSize); + cSize = ZSTD_blockHeaderSize + blockSize; + } else { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader24); + cSize += ZSTD_blockHeaderSize; + } + + ip += blockSize; + assert(remaining >= blockSize); + remaining -= blockSize; + op += cSize; + assert(dstCapacity >= cSize); + dstCapacity -= cSize; + DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u", + (U32)cSize); + } } if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending; return op-ostart; @@ -2357,27 +2047,32 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, - ZSTD_parameters params, U64 pledgedSrcSize, U32 dictID) + ZSTD_CCtx_params params, U64 pledgedSrcSize, U32 dictID) { BYTE* const op = (BYTE*)dst; - U32 const dictIDSizeCode = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params.fParams.noDictIDFlag ? 
0 : dictIDSizeCodeLength; /* 0-3 */ U32 const checksumFlag = params.fParams.checksumFlag>0; - U32 const windowSize = 1U << params.cParams.windowLog; - U32 const singleSegment = params.fParams.contentSizeFlag && (windowSize > (pledgedSrcSize-1)); + U32 const windowSize = (U32)1 << params.cParams.windowLog; + U32 const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); BYTE const windowLogByte = (BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); U32 const fcsCode = params.fParams.contentSizeFlag ? - (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : /* 0-3 */ - 0; + (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */ BYTE const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); - size_t pos; + size_t pos=0; if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall); - - MEM_writeLE32(dst, ZSTD_MAGICNUMBER); - op[4] = frameHeaderDecriptionByte; pos=5; + DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", + !params.fParams.noDictIDFlag, dictID, dictIDSizeCode); + + if (params.format == ZSTD_f_zstd1) { + MEM_writeLE32(dst, ZSTD_MAGICNUMBER); + pos = 4; + } + op[pos++] = frameHeaderDecriptionByte; if (!singleSegment) op[pos++] = windowLogByte; switch(dictIDSizeCode) { - default: /* impossible */ + default: assert(0); /* impossible */ case 0 : break; case 1 : op[pos] = (BYTE)(dictID); pos++; break; case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; @@ -2385,7 +2080,7 @@ } switch(fcsCode) { - default: /* impossible */ + default: assert(0); /* impossible */ case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; @@ -2394,121 +2089,146 @@ return pos; } +/* 
ZSTD_writeLastEmptyBlock() : + * output an empty Block with end-of-frame mark to complete a frame + * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) + * or an error code if `dstCapcity` is too small (stage != ZSTDcs_init) + return ERROR(stage_wrong); + if (cctx->appliedParams.ldmParams.enableLdm) + return ERROR(parameter_unsupported); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + return 0; +} + static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame, U32 lastFrameChunk) { - const BYTE* const ip = (const BYTE*) src; + ZSTD_matchState_t* ms = &cctx->blockState.matchState; size_t fhSize = 0; + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", + cctx->stage, (U32)srcSize); if (cctx->stage==ZSTDcs_created) return ERROR(stage_wrong); /* missing init (ZSTD_compressBegin) */ if (frame && (cctx->stage==ZSTDcs_init)) { - fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, cctx->frameContentSize, cctx->dictID); + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->appliedParams, + cctx->pledgedSrcSizePlusOne-1, cctx->dictID); if (ZSTD_isError(fhSize)) return fhSize; dstCapacity -= fhSize; dst = (char*)dst + fhSize; cctx->stage = ZSTDcs_ongoing; } - /* Check if blocks follow each other */ - if (src != cctx->nextSrc) { - /* not contiguous */ - ptrdiff_t const delta = cctx->nextSrc - ip; - cctx->lowLimit = cctx->dictLimit; - cctx->dictLimit = (U32)(cctx->nextSrc - cctx->base); - cctx->dictBase = cctx->base; - cctx->base -= delta; - cctx->nextToUpdate = cctx->dictLimit; - if (cctx->dictLimit - cctx->lowLimit < HASH_READ_SIZE) cctx->lowLimit = cctx->dictLimit; /* too small extDict */ + if (!srcSize) return fhSize; /* do not generate an empty block if no input */ + + if (!ZSTD_window_update(&ms->window, src, srcSize)) { + 
ms->nextToUpdate = ms->window.dictLimit; } - - /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ - if ((ip+srcSize > cctx->dictBase + cctx->lowLimit) & (ip < cctx->dictBase + cctx->dictLimit)) { - ptrdiff_t const highInputIdx = (ip + srcSize) - cctx->dictBase; - U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)cctx->dictLimit) ? cctx->dictLimit : (U32)highInputIdx; - cctx->lowLimit = lowLimitMax; - } - - cctx->nextSrc = ip + srcSize; - - if (srcSize) { - size_t const cSize = frame ? - ZSTD_compress_generic (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + if (cctx->appliedParams.ldmParams.enableLdm) + ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + + DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (U32)cctx->blockSize); + { size_t const cSize = frame ? + ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize); if (ZSTD_isError(cSize)) return cSize; + cctx->consumedSrcSize += srcSize; + cctx->producedCSize += (cSize + fhSize); + if (cctx->appliedParams.fParams.contentSizeFlag) { /* control src size */ + if (cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne) { + DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize >= %u", + (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize); + return ERROR(srcSize_wrong); + } + } return cSize + fhSize; - } else - return fhSize; + } } - size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { - return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 0); + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (U32)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); } -size_t ZSTD_getBlockSizeMax(ZSTD_CCtx* cctx) +size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) { - return MIN 
(ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << cctx->params.cParams.windowLog); + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); + return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); } size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { - size_t const blockSizeMax = ZSTD_getBlockSizeMax(cctx); + size_t const blockSizeMax = ZSTD_getBlockSize(cctx); if (srcSize > blockSizeMax) return ERROR(srcSize_wrong); - return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0, 0); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); } - -static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx* zc, const void* src, size_t srcSize) +/*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* src, size_t srcSize) { const BYTE* const ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; - - /* input becomes current prefix */ - zc->lowLimit = zc->dictLimit; - zc->dictLimit = (U32)(zc->nextSrc - zc->base); - zc->dictBase = zc->base; - zc->base += ip - zc->nextSrc; - zc->nextToUpdate = zc->dictLimit; - zc->loadedDictEnd = zc->forceWindow ? 0 : (U32)(iend - zc->base); - - zc->nextSrc = iend; + ZSTD_compressionParameters const* cParams = ¶ms->cParams; + + ZSTD_window_update(&ms->window, src, srcSize); + ms->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ms->window.base); + if (srcSize <= HASH_READ_SIZE) return 0; - switch(zc->params.cParams.strategy) + switch(params->cParams.strategy) { case ZSTD_fast: - ZSTD_fillHashTable (zc, iend, zc->params.cParams.searchLength); + ZSTD_fillHashTable(ms, cParams, iend); break; - case ZSTD_dfast: - ZSTD_fillDoubleHashTable (zc, iend, zc->params.cParams.searchLength); + ZSTD_fillDoubleHashTable(ms, cParams, iend); break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: - ZSTD_insertAndFindFirstIndex (zc, iend-HASH_READ_SIZE, zc->params.cParams.searchLength); + if (srcSize >= HASH_READ_SIZE) + ZSTD_insertAndFindFirstIndex(ms, cParams, iend-HASH_READ_SIZE); break; - case ZSTD_btlazy2: + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ case ZSTD_btopt: - case ZSTD_btopt2: - ZSTD_updateTree(zc, iend-HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength); + case ZSTD_btultra: + if (srcSize >= HASH_READ_SIZE) + ZSTD_updateTree(ms, cParams, iend-HASH_READ_SIZE, iend); break; default: - return ERROR(GENERIC); /* strategy doesn't exist; impossible */ + assert(0); /* not possible : not a valid strategy id */ } - zc->nextToUpdate = zc->loadedDictEnd; + ms->nextToUpdate = (U32)(iend - ms->window.base); return 0; } @@ -2528,27 +2248,32 @@ /* Dictionary format : - Magic == ZSTD_DICT_MAGIC (4 bytes) - HUF_writeCTable(256) - FSE_writeNCount(off) - FSE_writeNCount(ml) - FSE_writeNCount(ll) - RepOffsets - Dictionary content -*/ -/*! ZSTD_loadDictEntropyStats() : - @return : size read from dictionary - note : magic number supposed already checked */ -static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) + * See : + * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format + */ +/*! 
ZSTD_loadZstdDictionary() : + * @return : dictID, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed > 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; short offcodeNCount[MaxOff+1]; unsigned offcodeMaxValue = MaxOff; - BYTE scratchBuffer[1<hufTable, 255, dict, dictSize); + size_t dictID; + + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr); + dictPtr += 4; + + { unsigned maxSymbolValue = 255; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.hufCTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr); if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted); + if (maxSymbolValue < 255) return ERROR(dictionary_corrupted); dictPtr += hufHeaderSize; } @@ -2557,7 +2282,8 @@ if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted); if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted); /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ - CHECK_E (FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted); + CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, workspace, HUF_WORKSPACE_SIZE), + dictionary_corrupted); dictPtr += offcodeHeaderSize; } @@ -2567,8 +2293,9 @@ if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted); if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted); /* Every match length code must have non-zero probability */ - CHECK_F (ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML)); - CHECK_E (FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, 
matchlengthMaxValue, matchlengthLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted); + CHECK_F( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML)); + CHECK_E( FSE_buildCTable_wksp(bs->entropy.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE), + dictionary_corrupted); dictPtr += matchlengthHeaderSize; } @@ -2578,60 +2305,122 @@ if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted); if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted); /* Every literal length code must have non-zero probability */ - CHECK_F (ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL)); - CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted); + CHECK_F( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL)); + CHECK_E( FSE_buildCTable_wksp(bs->entropy.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE), + dictionary_corrupted); dictPtr += litlengthHeaderSize; } if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted); - cctx->rep[0] = MEM_readLE32(dictPtr+0); if (cctx->rep[0] == 0 || cctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted); - cctx->rep[1] = MEM_readLE32(dictPtr+4); if (cctx->rep[1] == 0 || cctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted); - cctx->rep[2] = MEM_readLE32(dictPtr+8); if (cctx->rep[2] == 0 || cctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted); + bs->rep[0] = MEM_readLE32(dictPtr+0); + bs->rep[1] = MEM_readLE32(dictPtr+4); + bs->rep[2] = MEM_readLE32(dictPtr+8); dictPtr += 12; - { U32 offcodeMax = MaxOff; - if ((size_t)(dictEnd - dictPtr) <= ((U32)-1) - 128 KB) { - U32 const maxOffset = (U32)(dictEnd - dictPtr) + 128 KB; /* The maximum offset that must be supported */ - /* Calculate minimum offset code required to represent maxOffset */ - 
offcodeMax = ZSTD_highbit32(maxOffset); + { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ } - /* Every possible supported offset <= dictContentSize + 128 KB must be representable */ + /* All offset values <= dictContentSize + 128 KB must be representable */ CHECK_F (ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff))); + /* All repCodes must be <= dictContentSize and != 0*/ + { U32 u; + for (u=0; u<3; u++) { + if (bs->rep[u] == 0) return ERROR(dictionary_corrupted); + if (bs->rep[u] > dictContentSize) return ERROR(dictionary_corrupted); + } } + + bs->entropy.hufCTable_repeatMode = HUF_repeat_valid; + bs->entropy.offcode_repeatMode = FSE_repeat_valid; + bs->entropy.matchlength_repeatMode = FSE_repeat_valid; + bs->entropy.litlength_repeatMode = FSE_repeat_valid; + CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize)); + return dictID; } - - cctx->flagStaticTables = 1; - return dictPtr - (const BYTE*)dict; } /** ZSTD_compress_insertDictionary() : -* @return : 0, or an error code */ -static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* dict, size_t dictSize) +* @return : dictID, or an error code */ +static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + void* workspace) { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); if ((dict==NULL) || (dictSize<=8)) return 0; - /* default : dict is pure content */ - if (MEM_readLE32(dict) != ZSTD_DICT_MAGIC) return ZSTD_loadDictionaryContent(zc, dict, dictSize); - zc->dictID = 
zc->params.fParams.noDictIDFlag ? 0 : MEM_readLE32((const char*)dict+4); - - /* known magic number : dict is parsed for entropy stats and content */ - { size_t const loadError = ZSTD_loadDictEntropyStats(zc, (const char*)dict+8 /* skip dictHeader */, dictSize-8); - size_t const eSize = loadError + 8; - if (ZSTD_isError(loadError)) return loadError; - return ZSTD_loadDictionaryContent(zc, (const char*)dict+eSize, dictSize-eSize); + ZSTD_reset_compressedBlockState(bs); + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) + return ZSTD_loadDictionaryContent(ms, params, dict, dictSize); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent(ms, params, dict, dictSize); + } + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_wrong); + assert(0); /* impossible */ } + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, workspace); } /*! ZSTD_compressBegin_internal() : -* @return : 0, or an error code */ -static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + * @return : 0, or an error code */ +size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, - ZSTD_parameters params, U64 pledgedSrcSize) + ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) { - ZSTD_compResetPolicy_e const crp = dictSize ? 
ZSTDcrp_fullReset : ZSTDcrp_continue; - CHECK_F(ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp)); - return ZSTD_compress_insertDictionary(cctx, dict, dictSize); + DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params.cParams.windowLog); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + + if (cdict && cdict->dictContentSize>0) { + cctx->requestedParams = params; + return ZSTD_resetCCtx_usingCDict(cctx, cdict, params.cParams.windowLog, + params.fParams, pledgedSrcSize, zbuff); + } + + CHECK_F( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_continue, zbuff) ); + { + size_t const dictID = ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + ¶ms, dict, dictSize, dictContentType, cctx->entropyWorkspace); + if (ZSTD_isError(dictID)) return dictID; + assert(dictID <= (size_t)(U32)-1); + cctx->dictID = (U32)dictID; + } + return 0; } +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params.cParams.windowLog); + /* compression parameters verification and optimization */ + CHECK_F( ZSTD_checkCParams(params.cParams) ); + return ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, + cdict, + params, pledgedSrcSize, + ZSTDb_not_buffered); +} /*! 
ZSTD_compressBegin_advanced() : * @return : 0, or an error code */ @@ -2639,19 +2428,24 @@ const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) { - /* compression parameters verification and optimization */ - CHECK_F(ZSTD_checkCParams(params.cParams)); - return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, pledgedSrcSize); + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); + return ZSTD_compressBegin_advanced_internal(cctx, + dict, dictSize, ZSTD_dct_auto, + NULL /*cdict*/, + cctxParams, pledgedSrcSize); } - size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); - return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, 0); + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (U32)dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL, + cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); } - size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) { return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); @@ -2667,11 +2461,12 @@ BYTE* op = ostart; size_t fhSize = 0; + DEBUGLOG(4, "ZSTD_writeEpilogue"); if (cctx->stage == ZSTDcs_created) return ERROR(stage_wrong); /* init missing */ /* special case : empty frame */ if (cctx->stage == ZSTDcs_init) { - fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, 0, 0); + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->appliedParams, 0, 0); if (ZSTD_isError(fhSize)) return fhSize; dstCapacity -= fhSize; op += fhSize; @@ -2687,9 +2482,10 @@ dstCapacity -= ZSTD_blockHeaderSize; } - if 
(cctx->params.fParams.checksumFlag) { + if (cctx->appliedParams.fParams.checksumFlag) { U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); if (dstCapacity<4) return ERROR(dstSize_tooSmall); + DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", checksum); MEM_writeLE32(op, checksum); op += 4; } @@ -2698,16 +2494,24 @@ return op-ostart; } - size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { size_t endResult; - size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 1); + size_t const cSize = ZSTD_compressContinue_internal(cctx, + dst, dstCapacity, src, srcSize, + 1 /* frame mode */, 1 /* last chunk */); if (ZSTD_isError(cSize)) return cSize; endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); if (ZSTD_isError(endResult)) return endResult; + if (cctx->appliedParams.fParams.contentSizeFlag) { /* control src size */ + DEBUGLOG(4, "end of frame : controlling src size"); + if (cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1) { + DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize = %u", + (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize); + return ERROR(srcSize_wrong); + } } return cSize + endResult; } @@ -2718,8 +2522,14 @@ const void* dict,size_t dictSize, ZSTD_parameters params) { - CHECK_F(ZSTD_compressBegin_internal(cctx, dict, dictSize, params, srcSize)); - return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); + DEBUGLOG(4, "ZSTD_compress_internal"); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + cctxParams); } size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx, @@ -2728,20 +2538,40 @@ const void* dict,size_t dictSize, ZSTD_parameters params) { + DEBUGLOG(4, "ZSTD_compress_advanced"); CHECK_F(ZSTD_checkCParams(params.cParams)); return 
ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params); } -size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, int compressionLevel) +/* Internal */ +size_t ZSTD_compress_advanced_internal( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_CCtx_params params) { - ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, dict ? dictSize : 0); - params.fParams.contentSizeFlag = 1; - return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params); + DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", + (U32)srcSize); + CHECK_F( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL, + params, srcSize, ZSTDb_not_buffered) ); + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } -size_t ZSTD_compressCCtx (ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) +size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, + const void* dict, size_t dictSize, int compressionLevel) { - return ZSTD_compress_usingDict(ctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize ? srcSize : 1, dict ? 
dictSize : 0); + ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); + assert(params.fParams.contentSizeFlag == 1); + ZSTD_CCtxParam_setParameter(&cctxParams, ZSTD_p_compressLiterals, compressionLevel>=0); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, cctxParams); +} + +size_t ZSTD_compressCCtx (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (U32)srcSize); + return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); } size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) @@ -2749,123 +2579,264 @@ size_t result; ZSTD_CCtx ctxBody; memset(&ctxBody, 0, sizeof(ctxBody)); - memcpy(&ctxBody.customMem, &defaultCustomMem, sizeof(ZSTD_customMem)); + ctxBody.customMem = ZSTD_defaultCMem; result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); - ZSTD_free(ctxBody.workSpace, defaultCustomMem); /* can't free ctxBody itself, as it's on stack; free only heap content */ + ZSTD_free(ctxBody.workSpace, ZSTD_defaultCMem); /* can't free ctxBody itself, as it's on stack; free only heap content */ return result; } /* ===== Dictionary API ===== */ -struct ZSTD_CDict_s { - void* dictBuffer; - const void* dictContent; - size_t dictContentSize; - ZSTD_CCtx* refContext; -}; /* typedef'd tp ZSTD_CDict within "zstd.h" */ +/*! 
ZSTD_estimateCDictSize_advanced() : + * Estimate amount of memory that will be needed to create a dictionary with following arguments */ +size_t ZSTD_estimateCDictSize_advanced( + size_t dictSize, ZSTD_compressionParameters cParams, + ZSTD_dictLoadMethod_e dictLoadMethod) +{ + DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (U32)sizeof(ZSTD_CDict)); + return sizeof(ZSTD_CDict) + HUF_WORKSPACE_SIZE + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); +} + +size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, dictSize); + return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); +} size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) { if (cdict==NULL) return 0; /* support sizeof on NULL */ - return ZSTD_sizeof_CCtx(cdict->refContext) + (cdict->dictBuffer ? cdict->dictContentSize : 0) + sizeof(*cdict); + DEBUGLOG(5, "sizeof(*cdict) : %u", (U32)sizeof(*cdict)); + return cdict->workspaceSize + (cdict->dictBuffer ? 
cdict->dictContentSize : 0) + sizeof(*cdict); } -ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, unsigned byReference, - ZSTD_parameters params, ZSTD_customMem customMem) +static size_t ZSTD_initCDict_internal( + ZSTD_CDict* cdict, + const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) { - if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; - if (!customMem.customAlloc || !customMem.customFree) return NULL; - - { ZSTD_CDict* const cdict = (ZSTD_CDict*) ZSTD_malloc(sizeof(ZSTD_CDict), customMem); - ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(customMem); - - if (!cdict || !cctx) { + DEBUGLOG(3, "ZSTD_initCDict_internal, dictContentType %u", (U32)dictContentType); + assert(!ZSTD_checkCParams(cParams)); + cdict->cParams = cParams; + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { + cdict->dictBuffer = NULL; + cdict->dictContent = dictBuffer; + } else { + void* const internalBuffer = ZSTD_malloc(dictSize, cdict->customMem); + cdict->dictBuffer = internalBuffer; + cdict->dictContent = internalBuffer; + if (!internalBuffer) return ERROR(memory_allocation); + memcpy(internalBuffer, dictBuffer, dictSize); + } + cdict->dictContentSize = dictSize; + + /* Reset the state to no dictionary */ + ZSTD_reset_compressedBlockState(&cdict->cBlockState); + { void* const end = ZSTD_reset_matchState( + &cdict->matchState, + (U32*)cdict->workspace + HUF_WORKSPACE_SIZE_U32, + &cParams, ZSTDcrp_continue, /* forCCtx */ 0); + assert(end == (char*)cdict->workspace + cdict->workspaceSize); + (void)end; + } + /* (Maybe) load the dictionary + * Skips loading the dictionary if it is <= 8 bytes. 
+ */ + { ZSTD_CCtx_params params; + memset(¶ms, 0, sizeof(params)); + params.compressionLevel = ZSTD_CLEVEL_DEFAULT; + params.fParams.contentSizeFlag = 1; + params.cParams = cParams; + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, ¶ms, + cdict->dictContent, cdict->dictContentSize, + dictContentType, cdict->workspace); + if (ZSTD_isError(dictID)) return dictID; + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; + } + } + + return 0; +} + +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, ZSTD_customMem customMem) +{ + DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (U32)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { ZSTD_CDict* const cdict = (ZSTD_CDict*)ZSTD_malloc(sizeof(ZSTD_CDict), customMem); + size_t const workspaceSize = HUF_WORKSPACE_SIZE + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + void* const workspace = ZSTD_malloc(workspaceSize, customMem); + + if (!cdict || !workspace) { ZSTD_free(cdict, customMem); - ZSTD_free(cctx, customMem); + ZSTD_free(workspace, customMem); return NULL; } - - if ((byReference) || (!dictBuffer) || (!dictSize)) { - cdict->dictBuffer = NULL; - cdict->dictContent = dictBuffer; - } else { - void* const internalBuffer = ZSTD_malloc(dictSize, customMem); - if (!internalBuffer) { ZSTD_free(cctx, customMem); ZSTD_free(cdict, customMem); return NULL; } - memcpy(internalBuffer, dictBuffer, dictSize); - cdict->dictBuffer = internalBuffer; - cdict->dictContent = internalBuffer; + cdict->customMem = customMem; + cdict->workspace = workspace; + cdict->workspaceSize = workspaceSize; + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dictBuffer, dictSize, + dictLoadMethod, dictContentType, + cParams) )) { + ZSTD_freeCDict(cdict); + return NULL; } - { size_t const errorCode = 
ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0); - if (ZSTD_isError(errorCode)) { - ZSTD_free(cdict->dictBuffer, customMem); - ZSTD_free(cctx, customMem); - ZSTD_free(cdict, customMem); - return NULL; - } } - - cdict->refContext = cctx; - cdict->dictContentSize = dictSize; return cdict; } } ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_customMem const allocator = { NULL, NULL, NULL }; - ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize); - params.fParams.contentSizeFlag = 1; - return ZSTD_createCDict_advanced(dict, dictSize, 0, params, allocator); + ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize); + return ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); } ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_customMem const allocator = { NULL, NULL, NULL }; - ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize); - params.fParams.contentSizeFlag = 1; - return ZSTD_createCDict_advanced(dict, dictSize, 1, params, allocator); + ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize); + return ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); } size_t ZSTD_freeCDict(ZSTD_CDict* cdict) { if (cdict==NULL) return 0; /* support free on NULL */ - { ZSTD_customMem const cMem = cdict->refContext->customMem; - ZSTD_freeCCtx(cdict->refContext); + { ZSTD_customMem const cMem = cdict->customMem; + ZSTD_free(cdict->workspace, cMem); ZSTD_free(cdict->dictBuffer, cMem); ZSTD_free(cdict, cMem); return 0; } } -static ZSTD_parameters ZSTD_getParamsFromCDict(const ZSTD_CDict* cdict) { - return ZSTD_getParamsFromCCtx(cdict->refContext); +/*! ZSTD_initStaticCDict_advanced() : + * Generate a digested dictionary in provided memory area. 
+ * workspace: The memory area to emplace the dictionary into. + * Provided pointer must 8-bytes aligned. + * It must outlive dictionary usage. + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level + * into its relevants cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. + */ +const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + size_t const neededSize = sizeof(ZSTD_CDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize) + + HUF_WORKSPACE_SIZE + matchStateSize; + ZSTD_CDict* const cdict = (ZSTD_CDict*) workspace; + void* ptr; + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", + (U32)workspaceSize, (U32)neededSize, (U32)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + if (dictLoadMethod == ZSTD_dlm_byCopy) { + memcpy(cdict+1, dict, dictSize); + dict = cdict+1; + ptr = (char*)workspace + sizeof(ZSTD_CDict) + dictSize; + } else { + ptr = cdict+1; + } + cdict->workspace = ptr; + cdict->workspaceSize = HUF_WORKSPACE_SIZE + matchStateSize; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + ZSTD_dlm_byRef, dictContentType, + cParams) )) + return NULL; + + return cdict; } -size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize) +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) +{ + assert(cdict != NULL); + return cdict->cParams; +} 
+ +/* ZSTD_compressBegin_usingCDict_advanced() : + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) { - if (cdict->dictContentSize) CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize)) - else CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, cdict->refContext->params, pledgedSrcSize)); - return 0; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + if (cdict==NULL) return ERROR(dictionary_wrong); + { ZSTD_CCtx_params params = cctx->requestedParams; + params.cParams = ZSTD_getCParamsFromCDict(cdict); + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? 
ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog); + } + params.fParams = fParams; + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, + cdict, + params, pledgedSrcSize, + ZSTDb_not_buffered); + } +} + +/* ZSTD_compressBegin_usingCDict() : + * pledgedSrcSize=0 means "unknown" + * if pledgedSrcSize>0, it will enable contentSizeFlag */ +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); + return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, 0); +} + +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + CHECK_F (ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize)); /* will check if cdict != NULL */ + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } /*! ZSTD_compress_usingCDict() : -* Compression using a digested Dictionary. -* Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. -* Note that compression level is decided during dictionary creation */ + * Compression using a digested Dictionary. + * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. 
+ * Note that compression parameters are decided at CDict creation time + * while frame parameters are hardcoded */ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict) { - CHECK_F(ZSTD_compressBegin_usingCDict(cctx, cdict, srcSize)); - - if (cdict->refContext->params.fParams.contentSizeFlag==1) { - cctx->params.fParams.contentSizeFlag = 1; - cctx->frameContentSize = srcSize; - } - - return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); + ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } @@ -2874,243 +2845,329 @@ * Streaming ********************************************************************/ -typedef enum { zcss_init, zcss_load, zcss_flush, zcss_final } ZSTD_cStreamStage; - -struct ZSTD_CStream_s { - ZSTD_CCtx* cctx; - ZSTD_CDict* cdictLocal; - const ZSTD_CDict* cdict; - char* inBuff; - size_t inBuffSize; - size_t inToCompress; - size_t inBuffPos; - size_t inBuffTarget; - size_t blockSize; - char* outBuff; - size_t outBuffSize; - size_t outBuffContentSize; - size_t outBuffFlushedSize; - ZSTD_cStreamStage stage; - U32 checksum; - U32 frameEnded; - U64 pledgedSrcSize; - U64 inputProcessed; - ZSTD_parameters params; - ZSTD_customMem customMem; -}; /* typedef'd to ZSTD_CStream within "zstd.h" */ - ZSTD_CStream* ZSTD_createCStream(void) { - return ZSTD_createCStream_advanced(defaultCustomMem); + DEBUGLOG(3, "ZSTD_createCStream"); + return ZSTD_createCStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticCCtx(workspace, workspaceSize); } ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) -{ - ZSTD_CStream* zcs; - - if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; - if (!customMem.customAlloc || 
!customMem.customFree) return NULL; - - zcs = (ZSTD_CStream*)ZSTD_malloc(sizeof(ZSTD_CStream), customMem); - if (zcs==NULL) return NULL; - memset(zcs, 0, sizeof(ZSTD_CStream)); - memcpy(&zcs->customMem, &customMem, sizeof(ZSTD_customMem)); - zcs->cctx = ZSTD_createCCtx_advanced(customMem); - if (zcs->cctx == NULL) { ZSTD_freeCStream(zcs); return NULL; } - return zcs; +{ /* CStream and CCtx are now same object */ + return ZSTD_createCCtx_advanced(customMem); } size_t ZSTD_freeCStream(ZSTD_CStream* zcs) { - if (zcs==NULL) return 0; /* support free on NULL */ - { ZSTD_customMem const cMem = zcs->customMem; - ZSTD_freeCCtx(zcs->cctx); - ZSTD_freeCDict(zcs->cdictLocal); - ZSTD_free(zcs->inBuff, cMem); - ZSTD_free(zcs->outBuff, cMem); - ZSTD_free(zcs, cMem); - return 0; - } + return ZSTD_freeCCtx(zcs); /* same object */ } + /*====== Initialization ======*/ -size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; } -size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; } - -size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_CStreamOutSize(void) +{ + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; +} + +static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, + const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, + const ZSTD_CDict* const cdict, + ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize) { - if (zcs->inBuffSize==0) return ERROR(stage_wrong); /* zcs has not been init at least once => can't reset */ - - if (zcs->cdict) CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize)) - else CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize)); - - zcs->inToCompress = 0; - zcs->inBuffPos = 0; - zcs->inBuffTarget = 
zcs->blockSize; - zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; - zcs->stage = zcss_load; - zcs->frameEnded = 0; - zcs->pledgedSrcSize = pledgedSrcSize; - zcs->inputProcessed = 0; + DEBUGLOG(4, "ZSTD_resetCStream_internal (disableLiteralCompression=%i)", + params.disableLiteralCompression); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + + CHECK_F( ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, + cdict, + params, pledgedSrcSize, + ZSTDb_buffered) ); + + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + cctx->inBuffTarget = cctx->blockSize + + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */ + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; return 0; /* ready to go */ } -size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, - const void* dict, size_t dictSize, - ZSTD_parameters params, unsigned long long pledgedSrcSize) +/* ZSTD_resetCStream(): + * pledgedSrcSize == 0 means "unknown" */ +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) { - /* allocate buffers */ - { size_t const neededInBuffSize = (size_t)1 << params.cParams.windowLog; - if (zcs->inBuffSize < neededInBuffSize) { - zcs->inBuffSize = neededInBuffSize; - ZSTD_free(zcs->inBuff, zcs->customMem); - zcs->inBuff = (char*) ZSTD_malloc(neededInBuffSize, zcs->customMem); - if (zcs->inBuff == NULL) return ERROR(memory_allocation); - } - zcs->blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, neededInBuffSize); - } - if (zcs->outBuffSize < ZSTD_compressBound(zcs->blockSize)+1) { - zcs->outBuffSize = ZSTD_compressBound(zcs->blockSize)+1; - ZSTD_free(zcs->outBuff, zcs->customMem); - zcs->outBuff = (char*) ZSTD_malloc(zcs->outBuffSize, 
zcs->customMem); - if (zcs->outBuff == NULL) return ERROR(memory_allocation); - } + ZSTD_CCtx_params params = zcs->requestedParams; + DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize); + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + params.fParams.contentSizeFlag = 1; + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, 0); + return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize); +} + +/*! ZSTD_initCStream_internal() : + * Note : for lib/compress only. Used by zstdmt_compress.c. + * Assumption 1 : params are valid + * Assumption 2 : either dict, or cdict, is defined, not both */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_internal"); + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ if (dict && dictSize >= 8) { + DEBUGLOG(4, "loading dictionary of size %u", (U32)dictSize); + if (zcs->staticSize) { /* static CCtx : never uses malloc */ + /* incompatible with internal cdict creation */ + return ERROR(memory_allocation); + } ZSTD_freeCDict(zcs->cdictLocal); - zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem); - if (zcs->cdictLocal == NULL) return ERROR(memory_allocation); + zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, ZSTD_dct_auto, + params.cParams, zcs->customMem); zcs->cdict = zcs->cdictLocal; - } else zcs->cdict = NULL; - - zcs->checksum = params.fParams.checksumFlag > 0; - zcs->params = params; - - return ZSTD_resetCStream(zcs, pledgedSrcSize); + if (zcs->cdictLocal == NULL) return ERROR(memory_allocation); + } else { + if (cdict) { + params.cParams = ZSTD_getCParamsFromCDict(cdict); /* cParams are enforced from cdict; it includes windowLog */ 
+ } + ZSTD_freeCDict(zcs->cdictLocal); + zcs->cdictLocal = NULL; + zcs->cdict = cdict; + } + + return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize); +} + +/* ZSTD_initCStream_usingCDict_advanced() : + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */ +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); + if (!cdict) return ERROR(dictionary_wrong); /* cannot handle NULL cdict (does not know what to do) */ + { ZSTD_CCtx_params params = zcs->requestedParams; + params.cParams = ZSTD_getCParamsFromCDict(cdict); + params.fParams = fParams; + return ZSTD_initCStream_internal(zcs, + NULL, 0, cdict, + params, pledgedSrcSize); + } } /* note : cdict must outlive compression session */ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) { - ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict); - size_t const initError = ZSTD_initCStream_advanced(zcs, NULL, 0, params, 0); - zcs->cdict = cdict; - zcs->cctx->dictID = params.fParams.noDictIDFlag ? 0 : cdict->refContext->dictID; - return initError; + ZSTD_frameParameters const fParams = { 0 /* contentSizeFlag */, 0 /* checksum */, 0 /* hideDictID */ }; + DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); + return ZSTD_initCStream_usingCDict_advanced(zcs, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); /* note : will check that cdict != NULL */ +} + + +/* ZSTD_initCStream_advanced() : + * pledgedSrcSize must be exact. + * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * dict is loaded with default parameters ZSTD_dm_auto and ZSTD_dlm_byCopy. 
*/ +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_advanced: pledgedSrcSize=%u, flag=%u", + (U32)pledgedSrcSize, params.fParams.contentSizeFlag); + CHECK_F( ZSTD_checkCParams(params.cParams) ); + if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */ + { ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize); + } } size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) { ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); - return ZSTD_initCStream_advanced(zcs, dict, dictSize, params, 0); + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN); } -size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize) +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) { - ZSTD_parameters params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); - if (pledgedSrcSize) params.fParams.contentSizeFlag = 1; - return ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize); + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. 
`0` will be interpreted as "empty" in the future */ + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); + ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); + return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize); } size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) { - return ZSTD_initCStream_usingDict(zcs, NULL, 0, compressionLevel); -} - -size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) -{ - if (zcs==NULL) return 0; /* support sizeof on NULL */ - return sizeof(zcs) + ZSTD_sizeof_CCtx(zcs->cctx) + ZSTD_sizeof_CDict(zcs->cdictLocal) + zcs->outBuffSize + zcs->inBuffSize; + DEBUGLOG(4, "ZSTD_initCStream"); + return ZSTD_initCStream_srcSize(zcs, compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN); } /*====== Compression ======*/ -typedef enum { zsf_gather, zsf_flush, zsf_end } ZSTD_flush_e; - -MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { size_t const length = MIN(dstCapacity, srcSize); - memcpy(dst, src, length); + if (length) memcpy(dst, src, length); return length; } -static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - void* dst, size_t* dstCapacityPtr, - const void* src, size_t* srcSizePtr, - ZSTD_flush_e const flush) +/** ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants and *compress_generic() + * non-static, because can be called from zstdmt_compress.c + * @return : hint size for next input */ +size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) { + const char* const istart = (const char*)input->src; + const char* const iend = istart + input->size; + const char* ip = istart + input->pos; + char* const ostart = (char*)output->dst; + char* const oend = ostart + 
output->size; + char* op = ostart + output->pos; U32 someMoreWork = 1; - const char* const istart = (const char*)src; - const char* const iend = istart + *srcSizePtr; - const char* ip = istart; - char* const ostart = (char*)dst; - char* const oend = ostart + *dstCapacityPtr; - char* op = ostart; + + /* check expectations */ + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (U32)flushMode); + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + assert(output->pos <= output->size); + assert(input->pos <= input->size); while (someMoreWork) { - switch(zcs->stage) + switch(zcs->streamStage) { - case zcss_init: return ERROR(init_missing); /* call ZBUFF_compressInit() first ! */ + case zcss_init: + /* call ZSTD_initCStream() first ! */ + return ERROR(init_missing); case zcss_load: - /* complete inBuffer */ + if ( (flushMode == ZSTD_e_end) + && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip)) /* enough dstCapacity */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ + size_t const cSize = ZSTD_compressEnd(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : %u", (U32)cSize); + if (ZSTD_isError(cSize)) return cSize; + ip = iend; + op += cSize; + zcs->frameEnded = 1; + ZSTD_startNewCompression(zcs); + someMoreWork = 0; break; + } + /* complete loading into inBuffer */ { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; - size_t const loaded = ZSTD_limitCopy(zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend-ip); + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); zcs->inBuffPos += loaded; ip += loaded; - if ( (zcs->inBuffPos==zcs->inToCompress) || (!flush && (toLoad != loaded)) ) { - someMoreWork = 0; break; /* not enough input to get a full block : stop there, wait for more */ - } } + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input 
to fill full block : stop here */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (zcs->inBuffPos == zcs->inToCompress) ) { + /* empty */ + someMoreWork = 0; break; + } + } /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); { void* cDst; size_t cSize; size_t const iSize = zcs->inBuffPos - zcs->inToCompress; size_t oSize = oend-op; + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); if (oSize >= ZSTD_compressBound(iSize)) - cDst = op; /* compress directly into output buffer (avoid flush stage) */ + cDst = op; /* compress into output buffer, to skip flush stage */ else cDst = zcs->outBuff, oSize = zcs->outBuffSize; - cSize = (flush == zsf_end) ? - ZSTD_compressEnd(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize) : - ZSTD_compressContinue(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize); + cSize = lastBlock ? + ZSTD_compressEnd(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); if (ZSTD_isError(cSize)) return cSize; - if (flush == zsf_end) zcs->frameEnded = 1; + zcs->frameEnded = lastBlock; /* prepare next block */ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; if (zcs->inBuffTarget > zcs->inBuffSize) - zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; /* note : inBuffSize >= blockSize */ + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (U32)zcs->inBuffTarget, (U32)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; - if (cDst == op) { op += cSize; break; } /* no need to flush */ + if (cDst == op) { /* no need to flush */ + op += cSize; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed directly in outBuffer"); + someMoreWork = 0; + ZSTD_startNewCompression(zcs); 
+ } + break; + } zcs->outBuffContentSize = cSize; zcs->outBuffFlushedSize = 0; - zcs->stage = zcss_flush; /* pass-through to flush stage */ + zcs->streamStage = zcss_flush; /* pass-through to flush stage */ } - + /* fall-through */ case zcss_flush: + DEBUGLOG(5, "flush stage"); { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; - size_t const flushed = ZSTD_limitCopy(op, oend-op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + size_t const flushed = ZSTD_limitCopy(op, oend-op, + zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", + (U32)toFlush, (U32)(oend-op), (U32)flushed); op += flushed; zcs->outBuffFlushedSize += flushed; - if (toFlush!=flushed) { someMoreWork = 0; break; } /* dst too small to store flushed data : stop there */ + if (toFlush!=flushed) { + /* flush not fully completed, presumably because dst is too small */ + assert(op==oend); + someMoreWork = 0; + break; + } zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; - zcs->stage = zcss_load; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed on flush"); + someMoreWork = 0; + ZSTD_startNewCompression(zcs); + break; + } + zcs->streamStage = zcss_load; break; } - case zcss_final: - someMoreWork = 0; /* do nothing */ - break; - - default: - return ERROR(GENERIC); /* impossible */ + default: /* impossible */ + assert(0); } } - *srcSizePtr = ip - istart; - *dstCapacityPtr = op - ostart; - zcs->inputProcessed += *srcSizePtr; + input->pos = ip - istart; + output->pos = op - ostart; if (zcs->frameEnded) return 0; { size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos; if (hintInSize==0) hintInSize = zcs->blockSize; @@ -3120,111 +3177,169 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) { - size_t sizeRead = input->size - input->pos; - size_t sizeWritten = output->size - output->pos; - size_t const result = ZSTD_compressStream_generic(zcs, - (char*)(output->dst) + output->pos, 
&sizeWritten, - (const char*)(input->src) + input->pos, &sizeRead, zsf_gather); - input->pos += sizeRead; - output->pos += sizeWritten; - return result; + /* check conditions */ + if (output->pos > output->size) return ERROR(GENERIC); + if (input->pos > input->size) return ERROR(GENERIC); + + return ZSTD_compressStream_generic(zcs, output, input, ZSTD_e_continue); +} + + +size_t ZSTD_compress_generic (ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + DEBUGLOG(5, "ZSTD_compress_generic, endOp=%u ", (U32)endOp); + /* check conditions */ + if (output->pos > output->size) return ERROR(GENERIC); + if (input->pos > input->size) return ERROR(GENERIC); + assert(cctx!=NULL); + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ + assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ + DEBUGLOG(4, "ZSTD_compress_generic : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */ + params.cParams = ZSTD_getCParamsFromCCtxParams( + &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/); + +#ifdef ZSTD_MULTITHREAD + if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { + params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + } + if (params.nbWorkers > 0) { + /* mt context creation */ + if (cctx->mtctx == NULL || (params.nbWorkers != ZSTDMT_getNbWorkers(cctx->mtctx))) { + DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbWorkers=%u", + params.nbWorkers); + if (cctx->mtctx != NULL) + DEBUGLOG(4, "ZSTD_compress_generic: previous nbWorkers was %u", + ZSTDMT_getNbWorkers(cctx->mtctx)); + ZSTDMT_freeCCtx(cctx->mtctx); + cctx->mtctx = 
ZSTDMT_createCCtx_advanced(params.nbWorkers, cctx->customMem); + if (cctx->mtctx == NULL) return ERROR(memory_allocation); + } + /* mt compression */ + DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); + CHECK_F( ZSTDMT_initCStream_internal( + cctx->mtctx, + prefixDict.dict, prefixDict.dictSize, ZSTD_dct_rawContent, + cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) ); + cctx->streamStage = zcss_load; + cctx->appliedParams.nbWorkers = params.nbWorkers; + } else +#endif + { CHECK_F( ZSTD_resetCStream_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, + params, cctx->pledgedSrcSizePlusOne-1) ); + assert(cctx->streamStage == zcss_load); + assert(cctx->appliedParams.nbWorkers == 0); + } } + + /* compression stage */ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + if (cctx->cParamsChanged) { + ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams); + cctx->cParamsChanged = 0; + } + { size_t const flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + if ( ZSTD_isError(flushMin) + || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + ZSTD_startNewCompression(cctx); + } + return flushMin; + } } +#endif + CHECK_F( ZSTD_compressStream_generic(cctx, output, input, endOp) ); + DEBUGLOG(5, "completed ZSTD_compress_generic"); + return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */ +} + +size_t ZSTD_compress_generic_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) +{ + ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; + ZSTD_inBuffer input = { src, srcSize, *srcPos }; + /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ + size_t const cErr = ZSTD_compress_generic(cctx, &output, &input, endOp); + *dstPos = output.pos; + *srcPos = input.pos; + return 
cErr; } /*====== Finalize ======*/ /*! ZSTD_flushStream() : -* @return : amount of data remaining to flush */ + * @return : amount of data remaining to flush */ size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - size_t srcSize = 0; - size_t sizeWritten = output->size - output->pos; - size_t const result = ZSTD_compressStream_generic(zcs, - (char*)(output->dst) + output->pos, &sizeWritten, - &srcSize, &srcSize, /* use a valid src address instead of NULL */ - zsf_flush); - output->pos += sizeWritten; - if (ZSTD_isError(result)) return result; - return zcs->outBuffContentSize - zcs->outBuffFlushedSize; /* remaining to flush */ + ZSTD_inBuffer input = { NULL, 0, 0 }; + if (output->pos > output->size) return ERROR(GENERIC); + CHECK_F( ZSTD_compressStream_generic(zcs, output, &input, ZSTD_e_flush) ); + return zcs->outBuffContentSize - zcs->outBuffFlushedSize; /* remaining to flush */ } size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - BYTE* const ostart = (BYTE*)(output->dst) + output->pos; - BYTE* const oend = (BYTE*)(output->dst) + output->size; - BYTE* op = ostart; - - if ((zcs->pledgedSrcSize) && (zcs->inputProcessed != zcs->pledgedSrcSize)) - return ERROR(srcSize_wrong); /* pledgedSrcSize not respected */ - - if (zcs->stage != zcss_final) { - /* flush whatever remains */ - size_t srcSize = 0; - size_t sizeWritten = output->size - output->pos; - size_t const notEnded = ZSTD_compressStream_generic(zcs, ostart, &sizeWritten, &srcSize, &srcSize, zsf_end); /* use a valid src address instead of NULL */ - size_t const remainingToFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; - op += sizeWritten; - if (remainingToFlush) { - output->pos += sizeWritten; - return remainingToFlush + ZSTD_BLOCKHEADERSIZE /* final empty block */ + (zcs->checksum * 4); - } - /* create epilogue */ - zcs->stage = zcss_final; - zcs->outBuffContentSize = !notEnded ? 
0 : - ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL, 0); /* write epilogue, including final empty block, into outBuff */ - } - - /* flush epilogue */ - { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; - size_t const flushed = ZSTD_limitCopy(op, oend-op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush); - op += flushed; - zcs->outBuffFlushedSize += flushed; - output->pos += op-ostart; - if (toFlush==flushed) zcs->stage = zcss_init; /* end reached */ - return toFlush - flushed; + ZSTD_inBuffer input = { NULL, 0, 0 }; + if (output->pos > output->size) return ERROR(GENERIC); + CHECK_F( ZSTD_compressStream_generic(zcs, output, &input, ZSTD_e_end) ); + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; + size_t const checksumSize = zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4; + size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize + lastBlockSize + checksumSize; + DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (U32)toFlush); + return toFlush; } } - /*-===== Pre-defined compression levels =====-*/ -#define ZSTD_DEFAULT_CLEVEL 1 #define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { -{ /* "default" */ +{ /* "default" - guarantees a monotonically increasing memory budget */ /* W, C, H, S, L, TL, strat */ - { 18, 12, 12, 1, 7, 16, ZSTD_fast }, /* level 0 - never used */ - { 19, 13, 14, 1, 7, 16, ZSTD_fast }, /* level 1 */ - { 19, 15, 16, 1, 6, 16, ZSTD_fast }, /* level 2 */ - { 20, 16, 17, 1, 5, 16, ZSTD_dfast }, /* level 3.*/ - { 20, 18, 18, 1, 5, 16, ZSTD_dfast }, /* level 4.*/ - { 20, 15, 18, 3, 5, 16, ZSTD_greedy }, /* level 5 */ - { 21, 16, 19, 2, 5, 16, ZSTD_lazy }, /* level 6 */ - { 21, 17, 20, 3, 5, 16, ZSTD_lazy }, /* level 7 */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ + { 19, 13, 14, 1, 7, 1, ZSTD_fast }, 
/* level 1 */ + { 19, 15, 16, 1, 6, 1, ZSTD_fast }, /* level 2 */ + { 20, 16, 17, 1, 5, 8, ZSTD_dfast }, /* level 3 */ + { 20, 17, 18, 1, 5, 8, ZSTD_dfast }, /* level 4 */ + { 20, 17, 18, 2, 5, 16, ZSTD_greedy }, /* level 5 */ + { 21, 17, 19, 2, 5, 16, ZSTD_lazy }, /* level 6 */ + { 21, 18, 19, 3, 5, 16, ZSTD_lazy }, /* level 7 */ { 21, 18, 20, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ - { 21, 20, 20, 3, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 21, 19, 20, 3, 5, 16, ZSTD_lazy2 }, /* level 9 */ { 21, 19, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ { 22, 20, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ { 22, 20, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 13 */ - { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 14 */ - { 22, 21, 21, 5, 5, 16, ZSTD_btlazy2 }, /* level 15 */ - { 23, 22, 22, 5, 5, 16, ZSTD_btlazy2 }, /* level 16 */ - { 23, 21, 22, 4, 5, 24, ZSTD_btopt }, /* level 17 */ - { 23, 23, 22, 6, 5, 32, ZSTD_btopt }, /* level 18 */ - { 23, 23, 22, 6, 3, 48, ZSTD_btopt }, /* level 19 */ - { 25, 25, 23, 7, 3, 64, ZSTD_btopt2 }, /* level 20 */ - { 26, 26, 23, 7, 3,256, ZSTD_btopt2 }, /* level 21 */ - { 27, 27, 25, 9, 3,512, ZSTD_btopt2 }, /* level 22 */ + { 22, 21, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 22, 22, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 21, 22, 4, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 22, 22, 4, 4, 48, ZSTD_btopt }, /* level 17 */ + { 23, 22, 22, 5, 3, 64, ZSTD_btopt }, /* level 18 */ + { 23, 23, 22, 7, 3,128, ZSTD_btopt }, /* level 19 */ + { 25, 25, 23, 7, 3,128, ZSTD_btultra }, /* level 20 */ + { 26, 26, 24, 7, 3,256, ZSTD_btultra }, /* level 21 */ + { 27, 27, 25, 9, 3,512, ZSTD_btultra }, /* level 22 */ }, { /* for srcSize <= 256 KB */ /* W, C, H, S, L, T, strat */ - { 0, 0, 0, 0, 0, 0, ZSTD_fast }, /* level 0 - not used */ - { 18, 13, 14, 1, 6, 8, ZSTD_fast }, /* level 1 */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for 
negative levels */ + { 18, 13, 14, 1, 6, 1, ZSTD_fast }, /* level 1 */ { 18, 14, 13, 1, 5, 8, ZSTD_dfast }, /* level 2 */ { 18, 16, 15, 1, 5, 8, ZSTD_dfast }, /* level 3 */ { 18, 15, 17, 1, 5, 8, ZSTD_greedy }, /* level 4.*/ @@ -3235,23 +3350,23 @@ { 18, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ { 18, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ { 18, 18, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 11.*/ - { 18, 18, 17, 7, 4, 8, ZSTD_lazy2 }, /* level 12.*/ - { 18, 19, 17, 6, 4, 8, ZSTD_btlazy2 }, /* level 13 */ + { 18, 18, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 19, 17, 7, 4, 8, ZSTD_btlazy2 }, /* level 13 */ { 18, 18, 18, 4, 4, 16, ZSTD_btopt }, /* level 14.*/ { 18, 18, 18, 4, 3, 16, ZSTD_btopt }, /* level 15.*/ { 18, 19, 18, 6, 3, 32, ZSTD_btopt }, /* level 16.*/ { 18, 19, 18, 8, 3, 64, ZSTD_btopt }, /* level 17.*/ { 18, 19, 18, 9, 3,128, ZSTD_btopt }, /* level 18.*/ { 18, 19, 18, 10, 3,256, ZSTD_btopt }, /* level 19.*/ - { 18, 19, 18, 11, 3,512, ZSTD_btopt2 }, /* level 20.*/ - { 18, 19, 18, 12, 3,512, ZSTD_btopt2 }, /* level 21.*/ - { 18, 19, 18, 13, 3,512, ZSTD_btopt2 }, /* level 22.*/ + { 18, 19, 18, 11, 3,512, ZSTD_btultra }, /* level 20.*/ + { 18, 19, 18, 12, 3,512, ZSTD_btultra }, /* level 21.*/ + { 18, 19, 18, 13, 3,512, ZSTD_btultra }, /* level 22.*/ }, { /* for srcSize <= 128 KB */ /* W, C, H, S, L, T, strat */ - { 17, 12, 12, 1, 7, 8, ZSTD_fast }, /* level 0 - not used */ - { 17, 12, 13, 1, 6, 8, ZSTD_fast }, /* level 1 */ - { 17, 13, 16, 1, 5, 8, ZSTD_fast }, /* level 2 */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* level 0 - not used */ + { 17, 12, 13, 1, 6, 1, ZSTD_fast }, /* level 1 */ + { 17, 13, 16, 1, 5, 1, ZSTD_fast }, /* level 2 */ { 17, 16, 16, 2, 5, 8, ZSTD_dfast }, /* level 3 */ { 17, 13, 15, 3, 4, 8, ZSTD_greedy }, /* level 4 */ { 17, 15, 17, 4, 4, 8, ZSTD_greedy }, /* level 5 */ @@ -3269,15 +3384,15 @@ { 17, 18, 17, 7, 3, 64, ZSTD_btopt }, /* level 17.*/ { 17, 18, 17, 7, 3,256, ZSTD_btopt }, /* level 18.*/ { 17, 18, 17, 8, 3,256, 
ZSTD_btopt }, /* level 19.*/ - { 17, 18, 17, 9, 3,256, ZSTD_btopt2 }, /* level 20.*/ - { 17, 18, 17, 10, 3,256, ZSTD_btopt2 }, /* level 21.*/ - { 17, 18, 17, 11, 3,512, ZSTD_btopt2 }, /* level 22.*/ + { 17, 18, 17, 9, 3,256, ZSTD_btultra }, /* level 20.*/ + { 17, 18, 17, 10, 3,256, ZSTD_btultra }, /* level 21.*/ + { 17, 18, 17, 11, 3,512, ZSTD_btultra }, /* level 22.*/ }, { /* for srcSize <= 16 KB */ /* W, C, H, S, L, T, strat */ - { 14, 12, 12, 1, 7, 6, ZSTD_fast }, /* level 0 - not used */ - { 14, 14, 14, 1, 6, 6, ZSTD_fast }, /* level 1 */ - { 14, 14, 14, 1, 4, 6, ZSTD_fast }, /* level 2 */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 14, 14, 14, 1, 6, 1, ZSTD_fast }, /* level 1 */ + { 14, 14, 14, 1, 4, 1, ZSTD_fast }, /* level 2 */ { 14, 14, 14, 1, 4, 6, ZSTD_dfast }, /* level 3.*/ { 14, 14, 14, 4, 4, 6, ZSTD_greedy }, /* level 4.*/ { 14, 14, 14, 3, 4, 6, ZSTD_lazy }, /* level 5.*/ @@ -3295,40 +3410,40 @@ { 14, 15, 15, 6, 3,128, ZSTD_btopt }, /* level 17.*/ { 14, 15, 15, 6, 3,256, ZSTD_btopt }, /* level 18.*/ { 14, 15, 15, 7, 3,256, ZSTD_btopt }, /* level 19.*/ - { 14, 15, 15, 8, 3,256, ZSTD_btopt2 }, /* level 20.*/ - { 14, 15, 15, 9, 3,256, ZSTD_btopt2 }, /* level 21.*/ - { 14, 15, 15, 10, 3,256, ZSTD_btopt2 }, /* level 22.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra }, /* level 20.*/ + { 14, 15, 15, 9, 3,256, ZSTD_btultra }, /* level 21.*/ + { 14, 15, 15, 10, 3,256, ZSTD_btultra }, /* level 22.*/ }, }; /*! ZSTD_getCParams() : -* @return ZSTD_compressionParameters structure for a selected compression level, `srcSize` and `dictSize`. +* @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. 
* Size values are optional, provide 0 if not known or unused */ -ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSize, size_t dictSize) +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { - ZSTD_compressionParameters cp; - size_t const addedSize = srcSize ? 0 : 500; - U64 const rSize = srcSize+dictSize ? srcSize+dictSize+addedSize : (U64)-1; + size_t const addedSize = srcSizeHint ? 0 : 500; + U64 const rSize = srcSizeHint+dictSize ? srcSizeHint+dictSize+addedSize : (U64)-1; U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); /* intentional underflow for srcSizeHint == 0 */ - if (compressionLevel <= 0) compressionLevel = ZSTD_DEFAULT_CLEVEL; /* 0 == default; no negative compressionLevel yet */ - if (compressionLevel > ZSTD_MAX_CLEVEL) compressionLevel = ZSTD_MAX_CLEVEL; - cp = ZSTD_defaultCParameters[tableID][compressionLevel]; - if (MEM_32bits()) { /* auto-correction, for 32-bits mode */ - if (cp.windowLog > ZSTD_WINDOWLOG_MAX) cp.windowLog = ZSTD_WINDOWLOG_MAX; - if (cp.chainLog > ZSTD_CHAINLOG_MAX) cp.chainLog = ZSTD_CHAINLOG_MAX; - if (cp.hashLog > ZSTD_HASHLOG_MAX) cp.hashLog = ZSTD_HASHLOG_MAX; - } - cp = ZSTD_adjustCParams(cp, srcSize, dictSize); - return cp; + int row = compressionLevel; + DEBUGLOG(5, "ZSTD_getCParams (cLevel=%i)", compressionLevel); + if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ + if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */ + if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL; + { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; + if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */ + return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize); } + } /*! 
ZSTD_getParams() : * same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`). * All fields of `ZSTD_frameParameters` are set to default (0) */ -ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSize, size_t dictSize) { +ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { ZSTD_parameters params; - ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize); + ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSizeHint, dictSize); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); memset(¶ms, 0, sizeof(params)); params.cParams = cParams; + params.fParams.contentSizeFlag = 1; return params; } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_compress_internal.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_compress_internal.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* This header contains definitions + * that shall **only** be used by modules within lib/compress. 
+ */ + +#ifndef ZSTD_COMPRESS_H +#define ZSTD_COMPRESS_H + +/*-************************************* +* Dependencies +***************************************/ +#include "zstd_internal.h" +#ifdef ZSTD_MULTITHREAD +# include "zstdmt_compress.h" +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-************************************* +* Constants +***************************************/ +#define kSearchStrength 8 +#define HASH_READ_SIZE 8 +#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index 1 now means "unsorted". + It could be confused for a real successor at index "1", if sorted as larger than its predecessor. + It's not a big deal though : candidate will just be sorted again. + Additionnally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */ + + +/*-************************************* +* Context memory management +***************************************/ +typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; +typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage; + +typedef struct ZSTD_prefixDict_s { + const void* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; +} ZSTD_prefixDict; + +typedef struct { + U32 hufCTable[HUF_CTABLE_SIZE_U32(255)]; + FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; + FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; + FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; + HUF_repeat hufCTable_repeatMode; + FSE_repeat offcode_repeatMode; + FSE_repeat matchlength_repeatMode; + FSE_repeat litlength_repeatMode; +} ZSTD_entropyCTables_t; + +typedef struct { + U32 off; + U32 len; +} ZSTD_match_t; + +typedef struct { + int price; + U32 off; + U32 mlen; + U32 litlen; + U32 rep[ZSTD_REP_NUM]; +} 
ZSTD_optimal_t; + +typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + U32* litFreq; /* table of literals statistics, of size 256 */ + U32* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + U32* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + U32* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ + ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ + U32 matchLengthSum; /* nb of matchLength codes */ + U32 offCodeSum; /* nb of offset codes */ + /* begin updated by ZSTD_setLog2Prices */ + U32 log2litSum; /* pow2 to compare log2(litfreq) to */ + U32 log2litLengthSum; /* pow2 to compare log2(llfreq) to */ + U32 log2matchLengthSum; /* pow2 to compare log2(mlfreq) to */ + U32 log2offCodeSum; /* pow2 to compare log2(offreq) to */ + /* end : updated by ZSTD_setLog2Prices */ + U32 staticPrices; /* prices follow a pre-defined cost structure, statistics are irrelevant */ +} optState_t; + +typedef struct { + ZSTD_entropyCTables_t entropy; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_compressedBlockState_t; + +typedef struct { + BYTE const* nextSrc; /* next block here to continue on current prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more data */ +} ZSTD_window_t; + +typedef struct { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary */ + U32 nextToUpdate; /* index from which to continue table update */ + U32 nextToUpdate3; /* index from which to continue table update */ + U32 hashLog3; /* 
dispatch table : larger == faster, more memory */ + U32* hashTable; + U32* hashTable3; + U32* chainTable; + optState_t opt; /* optimal parser state */ +} ZSTD_matchState_t; + +typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; + ZSTD_matchState_t matchState; +} ZSTD_blockState_t; + +typedef struct { + U32 offset; + U32 checksum; +} ldmEntry_t; + +typedef struct { + ZSTD_window_t window; /* State for the window round buffer management */ + ldmEntry_t* hashTable; + BYTE* bucketOffsets; /* Next position in bucket to insert entry */ + U64 hashPower; /* Used to compute the rolling hash. + * Depends on ldmParams.minMatchLength */ +} ldmState_t; + +typedef struct { + U32 enableLdm; /* 1 if enable long distance matching */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ + U32 hashEveryLog; /* Log number of entries to skip */ + U32 windowLog; /* Window log for the LDM */ +} ldmParams_t; + +typedef struct { + U32 offset; + U32 litLength; + U32 matchLength; +} rawSeq; + +typedef struct { + rawSeq* seq; /* The start of the sequences */ + size_t pos; /* The position where reading stopped. <= size. */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity of the `seq` pointer */ +} rawSeqStore_t; + +struct ZSTD_CCtx_params_s { + ZSTD_format_e format; + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; + + int compressionLevel; + int disableLiteralCompression; + int forceWindow; /* force back-references to respect limit of + * 1< 63) ? 
ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; +} + +/* ZSTD_MLcode() : + * note : mlBase = matchLength - MINMATCH; + * because it's the format it's stored in seqStore->sequences */ +MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) +{ + static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, + 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; + static const U32 ML_deltaCode = 36; + return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase]; +} + +/*! ZSTD_storeSeq() : + * Store a sequence (literal length, literals, offset code and match length code) into seqStore_t. + * `offsetCode` : distance to match + 3 (values 1-3 are repCodes). 
+ * `mlBase` : matchLength - MINMATCH +*/ +MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t mlBase) +{ +#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 6) + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); + DEBUGLOG(6, "Cpos%7u :%3u literals, match%3u bytes at dist.code%7u", + pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode); + } +#endif + /* copy Literals */ + assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB); + ZSTD_wildcopy(seqStorePtr->lit, literals, litLength); + seqStorePtr->lit += litLength; + + /* literal Length */ + if (litLength>0xFFFF) { + assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ + seqStorePtr->longLengthID = 1; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ + seqStorePtr->sequences[0].offset = offsetCode + 1; + + /* match Length */ + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ + seqStorePtr->longLengthID = 2; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].matchLength = (U16)mlBase; + + seqStorePtr->sequences++; +} + + +/*-************************************* +* Match length counter +***************************************/ +static unsigned ZSTD_NbCommonBytes (size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] 
= { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + _BitScanForward( &r, (U32)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } +} + + +MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) +{ + const BYTE* const pStart = pIn; + const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1); + + if (pIn < pInLoopLimit) { + { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (diff) return 
ZSTD_NbCommonBytes(diff); } + pIn+=sizeof(size_t); pMatch+=sizeof(size_t); + while (pIn < pInLoopLimit) { + size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; } + pIn += ZSTD_NbCommonBytes(diff); + return (size_t)(pIn - pStart); + } } + if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn> (32-h) ; } +MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ + +static const U32 prime4bytes = 2654435761U; +static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } + +static const U64 prime5bytes = 889523592379ULL; +static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } + +static const U64 prime6bytes = 227718039650203ULL; +static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + +static const U64 prime7bytes = 58295818150454627ULL; +static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } + +static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + +MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) +{ + switch(mls) + { + default: + case 4: return 
ZSTD_hash4Ptr(p, hBits); + case 5: return ZSTD_hash5Ptr(p, hBits); + case 6: return ZSTD_hash6Ptr(p, hBits); + case 7: return ZSTD_hash7Ptr(p, hBits); + case 8: return ZSTD_hash8Ptr(p, hBits); + } +} + +/*-************************************* +* Round buffer management +***************************************/ +/* Max current allowed */ +#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) +/* Maximum chunk size before overflow correction needs to be called again */ +#define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ + - ZSTD_CURRENT_MAX) /* Maximum beginning lowLimit */ + +/** + * ZSTD_window_clear(): + * Clears the window containing the history by simply setting it to empty. + */ +MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) +{ + size_t const endT = (size_t)(window->nextSrc - window->base); + U32 const end = (U32)endT; + + window->lowLimit = end; + window->dictLimit = end; +} + +/** + * ZSTD_window_hasExtDict(): + * Returns non-zero if the window has a non-empty extDict. + */ +MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) +{ + return window.lowLimit < window.dictLimit; +} + +/** + * ZSTD_window_needOverflowCorrection(): + * Returns non-zero if the indices are getting too large and need overflow + * protection. + */ +MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + void const* srcEnd) +{ + U32 const current = (U32)((BYTE const*)srcEnd - window.base); + return current > ZSTD_CURRENT_MAX; +} + +/** + * ZSTD_window_correctOverflow(): + * Reduces the indices to protect from index overflow. + * Returns the correction made to the indices, which must be applied to every + * stored index. + * + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + * NOTE: (maxDist & cycleMask) must be zero. 
+ */ +MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) +{ + /* preemptive overflow correction: + * 1. correction is large enough: + * lowLimit > (3<<29) ==> current > 3<<29 + 1< (3<<29 + 1< (3<<29) - (1< (3<<29) - (1<<30) (NOTE: chainLog <= 30) + * > 1<<29 + * + * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow: + * After correction, current is less than (1<base < 1<<32. + * 3. (cctx->lowLimit + 1< 3<<29 + 1<base); + U32 const newCurrent = (current & cycleMask) + maxDist; + U32 const correction = current - newCurrent; + assert((maxDist & cycleMask) == 0); + assert(current > newCurrent); + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + + window->base += correction; + window->dictBase += correction; + window->lowLimit -= correction; + window->dictLimit -= correction; + + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, + window->lowLimit); + return correction; +} + +/** + * ZSTD_window_enforceMaxDist(): + * Updates lowLimit so that: + * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd + * This allows a simple check that index >= lowLimit to see if index is valid. + * This must be called before a block compression call, with srcEnd as the block + * source end. + * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit. + * This is because dictionaries are allowed to be referenced as long as the last + * byte of the dictionary is in the window, but once they are out of range, + * they cannot be referenced. If loadedDictEndPtr is NULL, we use + * loadedDictEnd == 0. + */ +MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + void const* srcEnd, U32 maxDist, + U32* loadedDictEndPtr) +{ + U32 const current = (U32)((BYTE const*)srcEnd - window->base); + U32 loadedDictEnd = loadedDictEndPtr != NULL ? 
*loadedDictEndPtr : 0; + if (current > maxDist + loadedDictEnd) { + U32 const newLowLimit = current - maxDist; + if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit; + if (window->dictLimit < window->lowLimit) { + DEBUGLOG(5, "Update dictLimit from %u to %u", window->dictLimit, + window->lowLimit); + window->dictLimit = window->lowLimit; + } + if (loadedDictEndPtr) + *loadedDictEndPtr = 0; + } +} + +/** + * ZSTD_window_update(): + * Updates the window by appending [src, src + srcSize) to the window. + * If it is not contiguous, the current prefix becomes the extDict, and we + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize) +{ + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; + /* Check if blocks follow each other */ + if (src != window->nextSrc) { + /* not contiguous */ + size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); + DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", + window->dictLimit); + window->lowLimit = window->dictLimit; + assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */ + window->dictLimit = (U32)distanceFromBase; + window->dictBase = window->base; + window->base = ip - distanceFromBase; + // ms->nextToUpdate = window->dictLimit; + if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit; /* too small extDict */ + contiguous = 0; + } + window->nextSrc = ip + srcSize; + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { + ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; + U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? 
window->dictLimit : (U32)highInputIdx; + window->lowLimit = lowLimitMax; + } + return contiguous; +} + +#if defined (__cplusplus) +} +#endif + + +/* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress + * ============================================================== */ + +/* ZSTD_getCParamsFromCCtxParams() : + * cParams are built depending on compressionLevel, src size hints, + * LDM and manually set compression parameters. + */ +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize); + +/*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. + * @return : 0, or an error code */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); + +/*! ZSTD_compressStream_generic() : + * Private use only. To be called from zstdmt_compress.c in single-thread mode. */ +size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode); + +/*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict); + +/* ZSTD_compressBegin_advanced_internal() : + * Private use only. To be called from zstdmt_compress.c. */ +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + unsigned long long pledgedSrcSize); + +/* ZSTD_compress_advanced_internal() : + * Private use only. To be called from zstdmt_compress.c. 
*/ +size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_CCtx_params params); + + +/* ZSTD_writeLastEmptyBlock() : + * output an empty Block with end-of-frame mark to complete a frame + * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) + * or an error code if `dstCapcity` is too small (hashTable; + U32 const hBitsL = cParams->hashLog; + U32 const mls = cParams->searchLength; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const current = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) + hashSmall[smHash] = current + i; + if (i == 0 || hashLarge[lgHash] == 0) + hashLarge[lgHash] = current + i; + } + } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize, + U32 const mls /* template */) +{ + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 lowestIndex = ms->window.dictLimit; 
+ const BYTE* const lowest = base + lowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + /* init */ + ip += (ip==lowest); + { U32 const maxRep = (U32)(ip-lowest); + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + U32 const current = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 const matchIndexS = hashSmall[h]; + const BYTE* matchLong = base + matchIndexL; + const BYTE* match = base + matchIndexS; + hashLong[h2] = hashSmall[h] = current; /* update hash tables */ + + assert(offset_1 <= current); /* supposed guaranteed by construction */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + /* favor repcode */ + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); + } else { + U32 offset; + if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) { + mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; + offset = (U32)(ip-matchLong); + while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) { + size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = current + 1; + if ( (matchIndexL3 > lowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) { + mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; + ip++; + offset = (U32)(ip-matchL3); + while (((ip>anchor) & 
(matchL3>lowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + } else { + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + offset = (U32)(ip-match); + while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + } + + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } + + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = + hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2; /* here because current+2 could be > iend-8 */ + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = + hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base); + + /* check immediate repcode */ + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? 
offset_2 : offsetSaved; + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + const U32 mls = cParams->searchLength; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7); + } +} + + +static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize, + U32 const mls /* template */) +{ + U32* const hashLong = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 lowestIndex = ms->window.lowLimit; + const BYTE* const dictStart = dictBase + lowestIndex; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const lowPrefixPtr = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 matchIndex = hashSmall[hSmall]; + const BYTE* matchBase = 
matchIndex < dictLimit ? dictBase : base; + const BYTE* match = matchBase + matchIndex; + + const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); + const U32 matchLongIndex = hashLong[hLong]; + const BYTE* matchLongBase = matchLongIndex < dictLimit ? dictBase : base; + const BYTE* matchLong = matchLongBase + matchLongIndex; + + const U32 current = (U32)(ip-base); + const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ + const BYTE* repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* repMatch = repBase + repIndex; + size_t mLength; + hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ + + if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4; + ip++; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); + } else { + if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend; + const BYTE* lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr; + U32 offset; + mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, lowPrefixPtr) + 8; + offset = current - matchLongIndex; + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + + } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE* const match3Base = matchIndex3 < dictLimit ? 
dictBase : base; + const BYTE* match3 = match3Base + matchIndex3; + U32 offset; + hashLong[h3] = current + 1; + if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { + const BYTE* matchEnd = matchIndex3 < dictLimit ? dictEnd : iend; + const BYTE* lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr; + mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, lowPrefixPtr) + 8; + ip++; + offset = current+1 - matchIndex3; + while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ + } else { + const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend; + const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4; + offset = current - matchIndex; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + } } + + /* found a match : store it */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2; + hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2; + hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base); + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < dictLimit ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + U32 const mls = cParams->searchLength; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7); + } +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_double_fast.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_double_fast.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_DOUBLE_FAST_H +#define ZSTD_DOUBLE_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "mem.h" /* U32 */ +#include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + ZSTD_compressionParameters const* cParams, + void const* end); +size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_DOUBLE_FAST_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_fast.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_fast.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_compress_internal.h" +#include "zstd_fast.h" + + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + ZSTD_compressionParameters const* cParams, + void const* end) +{ + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog; + U32 const mls = cParams->searchLength; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash table. 
+ * Insert the other positions if their hash entry is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const current = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const hash = ZSTD_hashPtr(ip + i, hBits, mls); + if (i == 0 || hashTable[hash] == 0) + hashTable[hash] = current + i; + } + } +} + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_fast_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const hlog, U32 const stepSize, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 lowestIndex = ms->window.dictLimit; + const BYTE* const lowest = base + lowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + /* init */ + ip += (ip==lowest); + { U32 const maxRep = (U32)(ip-lowest); + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + size_t const h = ZSTD_hashPtr(ip, hlog, mls); + U32 const current = (U32)(ip-base); + U32 const matchIndex = hashTable[h]; + const BYTE* match = base + matchIndex; + hashTable[h] = current; /* update hash table */ + + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); + } else { + if ( (matchIndex <= lowestIndex) + || (MEM_read32(match) != MEM_read32(ip)) ) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } + mLength = 
ZSTD_count(ip+4, match+4, iend) + 4; + { U32 const offset = (U32)(ip-match); + while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } } + + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + /* check immediate repcode */ + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? 
offset_2 : offsetSaved; + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + U32 const hlog = cParams->hashLog; + U32 const mls = cParams->searchLength; + U32 const stepSize = cParams->targetLength; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4); + case 5 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5); + case 6 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6); + case 7 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7); + } +} + + +static size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const hlog, U32 const stepSize, U32 const mls) +{ + U32* hashTable = ms->hashTable; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 lowestIndex = ms->window.lowLimit; + const BYTE* const dictStart = dictBase + lowestIndex; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const lowPrefixPtr = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t h = ZSTD_hashPtr(ip, hlog, mls); + const U32 matchIndex = hashTable[h]; + const BYTE* matchBase = matchIndex < dictLimit ? 
dictBase : base; + const BYTE* match = matchBase + matchIndex; + const U32 current = (U32)(ip-base); + const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ + const BYTE* repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* repMatch = repBase + repIndex; + size_t mLength; + hashTable[h] = current; /* update hash table */ + + if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4; + ip++; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); + } else { + if ( (matchIndex < lowestIndex) || + (MEM_read32(match) != MEM_read32(ip)) ) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } + { const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend; + const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr; + U32 offset; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset = current - matchIndex; + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } } + + /* found a match : store it */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; + hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < dictLimit ? 
dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH); + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + U32 const hlog = cParams->hashLog; + U32 const mls = cParams->searchLength; + U32 const stepSize = cParams->targetLength; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4); + case 5 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5); + case 6 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6); + case 7 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7); + } +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_fast.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_fast.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_FAST_H +#define ZSTD_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "mem.h" /* U32 */ +#include "zstd_compress_internal.h" + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + ZSTD_compressionParameters const* cParams, + void const* end); +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_FAST_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_lazy.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_lazy.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,824 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include "zstd_compress_internal.h" +#include "zstd_lazy.h" + + +/*-************************************* +* Binary Tree search +***************************************/ + +void ZSTD_updateDUBT( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* iend, + U32 mls) +{ + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + if (idx != target) + DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)", + idx, target, ms->window.dictLimit); + assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */ + (void)iend; + + assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */ + for ( ; idx < target ; idx++) { + size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */ + U32 const matchIndex = hashTable[h]; + + U32* const nextCandidatePtr = bt + 2*(idx&btMask); + U32* const sortMarkPtr = nextCandidatePtr + 1; + + DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx); + hashTable[h] = idx; /* Update Hash Table */ + *nextCandidatePtr = matchIndex; /* update BT like a chain */ + *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK; + } + ms->nextToUpdate = target; +} + + +/** ZSTD_insertDUBT1() : + * sort one already inserted but unsorted position + * assumption : current >= btlow == (current - btmask) + * doesn't fail */ +static void ZSTD_insertDUBT1( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + U32 current, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, int extDict) +{ + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = 
ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current; + const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = smallerPtr + 1; + U32 matchIndex = *smallerPtr; + U32 dummy32; /* to be nullified at the end */ + U32 const windowLow = ms->window.lowLimit; + + DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)", + current, dictLimit, windowLow); + assert(current >= btLow); + assert(ip < iend); /* condition for ZSTD_count */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < current); + + if ( (!extDict) + || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ + || (current < dictLimit) /* both in extDict */) { + const BYTE* const mBase = !extDict || ((matchIndex+matchLength) >= dictLimit) ? 
base : dictBase; + assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ + || (current < dictLimit) ); + match = mBase + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ", + current, matchIndex, (U32)matchLength); + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u", + matchIndex, btLow, nextPtr[1]); + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u", + matchIndex, btLow, nextPtr[0]); + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; +} + + +static size_t ZSTD_DUBT_findBestMatch ( + ZSTD_matchState_t* ms, 
ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + U32 const mls, + U32 const extDict) +{ + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + + const BYTE* const base = ms->window.base; + U32 const current = (U32)(ip-base); + U32 const windowLow = ms->window.lowLimit; + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= current) ? 0 : current - btMask; + U32 const unsortLimit = MAX(btLow, windowLow); + + U32* nextCandidate = bt + 2*(matchIndex&btMask); + U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1; + U32 nbCompares = 1U << cParams->searchLog; + U32 nbCandidates = nbCompares; + U32 previousCandidate = 0; + + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current); + assert(ip <= iend-8); /* required for h calculation */ + + /* reach end of unsorted candidates list */ + while ( (matchIndex > unsortLimit) + && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) + && (nbCandidates > 1) ) { + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted", + matchIndex); + *unsortedMark = previousCandidate; + previousCandidate = matchIndex; + matchIndex = *nextCandidate; + nextCandidate = bt + 2*(matchIndex&btMask); + unsortedMark = bt + 2*(matchIndex&btMask) + 1; + nbCandidates --; + } + + if ( (matchIndex > unsortLimit) + && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) { + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u", + matchIndex); + *nextCandidate = *unsortedMark = 0; /* nullify next candidate if it's still unsorted (note : simplification, detrimental to compression ratio, beneficial for speed) */ + } + + /* batch sort stacked candidates */ + matchIndex = previousCandidate; + while (matchIndex) { /* will end on matchIndex == 0 */ + U32* const nextCandidateIdxPtr = bt + 
2*(matchIndex&btMask) + 1; + U32 const nextCandidateIdx = *nextCandidateIdxPtr; + ZSTD_insertDUBT1(ms, cParams, matchIndex, iend, + nbCandidates, unsortLimit, extDict); + matchIndex = nextCandidateIdx; + nbCandidates++; + } + + /* find longest match */ + { size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = bt + 2*(current&btMask) + 1; + U32 matchEndIdx = current+8+1; + U32 dummy32; /* to be nullified at the end */ + size_t bestLength = 0; + + matchIndex = hashTable[h]; + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match; + + if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < 
ip[matchLength]) { + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { + U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + current, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + } +} + + +/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +static size_t ZSTD_BtFindBestMatch ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls /* template */) +{ + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 0); +} + + +static size_t ZSTD_BtFindBestMatch_selectMLS ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(cParams->searchLength) + { + default : /* includes case 3 */ 
+ case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4); + case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5); + case 7 : + case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6); + } +} + + +/** Tree updater, providing best match */ +static size_t ZSTD_BtFindBestMatch_extDict ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls) +{ + DEBUGLOG(7, "ZSTD_BtFindBestMatch_extDict"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 1); +} + + +static size_t ZSTD_BtFindBestMatch_selectMLS_extDict ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(cParams->searchLength) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 4); + case 5 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 5); + case 7 : + case 6 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 6); + } +} + + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & mask] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip) +{ + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, cParams->searchLength); +} + + +/* inlining is important to hardwire a hot branch (template emulation) */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_HcFindBestMatch_generic ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const U32 extDict) +{ + U32* const chainTable = ms->chainTable; + const U32 chainSize = (1 << cParams->chainLog); + const U32 chainMask = chainSize-1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 lowLimit = ms->window.lowLimit; + const U32 current = (U32)(ip-base); + const U32 minChain = current > chainSize ? 
current - chainSize : 0; + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + + /* HC4 match finder */ + U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + + for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((!extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + if (match[ml] == ip[ml]) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= minChain) break; + matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); + } + + return ml; +} + + +FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(cParams->searchLength) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 0); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 0); + case 7 : + case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 0); + } +} + + +FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* const iLimit, + size_t* const offsetPtr) +{ + switch(cParams->searchLength) + { + default : /* includes case 3 */ + case 4 : return 
ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 1); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 1); + case 7 : + case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 1); + } +} + + +/* ******************************* +* Common parser - lazy strategy +*********************************/ +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, + const void* src, size_t srcSize, + const U32 searchMethod, const U32 depth) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base + ms->window.dictLimit; + + typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + searchMax_f const searchMax = searchMethod ? 
ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS; + U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; + + /* init */ + ip += (ip==base); + ms->nextToUpdate3 = ms->nextToUpdate; + { U32 const maxRep = (U32)(ip-base); + if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; + if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; + } + + /* Match Loop */ + while (ip < ilimit) { + size_t matchLength=0; + size_t offset=0; + const BYTE* start=ip+1; + + /* check repCode */ + if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) { + /* repcode : we take it */ + matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + if (depth==0) goto _storeSequence; + } + + /* first search (depth 0) */ + { size_t offsetFound = 99999999; + size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offset=offsetFound; + } + + if (matchLength < 4) { + ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + continue; + } + + /* let's try to find a better solution */ + if (depth>=1) + while (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + { size_t offset2=99999999; + size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; /* search a better one */ + } } + + /* let's find an even better one */ + if ((depth==2) && (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const ml2 = 
ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(ml2 * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((ml2 >= 4) && (gain2 > gain1)) + matchLength = ml2, offset = 0, start = ip; + } + { size_t offset2=99999999; + size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* NOTE: + * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. + * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which + * overflows the pointer, which is undefined behavior. + */ + /* catch up */ + if (offset) { + while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > base)) + && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + /* store sequence */ +_storeSequence: + { size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + while ( ((ip <= ilimit) & (offset_2>0)) + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } + + /* Save reps for next block */ + rep[0] = offset_1 ? offset_1 : savedOffset; + rep[1] = offset_2 ? 
offset_2 : savedOffset; + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2); +} + +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2); +} + +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1); +} + +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0); +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, + const void* src, size_t srcSize, + const U32 searchMethod, const U32 depth) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 dictLimit = ms->window.dictLimit; + const U32 lowestIndex = ms->window.lowLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const 
dictStart = dictBase + lowestIndex; + + typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS; + + U32 offset_1 = rep[0], offset_2 = rep[1]; + + /* init */ + ms->nextToUpdate3 = ms->nextToUpdate; + ip += (ip == prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + size_t matchLength=0; + size_t offset=0; + const BYTE* start=ip+1; + U32 current = (U32)(ip-base); + + /* check repCode */ + { const U32 repIndex = (U32)(current+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4; + if (depth==0) goto _storeSequence; + } } + + /* first search (depth 0) */ + { size_t offsetFound = 99999999; + size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offset=offsetFound; + } + + if (matchLength < 4) { + ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + continue; + } + + /* let's try to find a better solution */ + if (depth>=1) + while (ip= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } } + + /* search match, depth 1 */ + { size_t offset2=99999999; + size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; /* search a better one */ + } } + + /* let's find an even better one */ + if ((depth==2) && (ip= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } } + + /* search match, depth 2 */ + { size_t offset2=99999999; + size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ + if (offset) { + U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + const BYTE* match = (matchIndex < dictLimit) ? 
dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + + /* store sequence */ +_storeSequence: + { size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + while (ip <= ilimit) { + const U32 repIndex = (U32)((ip-base) - offset_2); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ + } + break; + } } + + /* Save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2); +} + +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2); +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_lazy.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_lazy.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_LAZY_H +#define ZSTD_LAZY_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "zstd_compress_internal.h" + +U32 ZSTD_insertAndFindFirstIndex( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip); + +void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK */ + +size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + 
+#endif /* ZSTD_LAZY_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_ldm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_ldm.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#include "zstd_ldm.h" + +#include "zstd_fast.h" /* ZSTD_fillHashTable() */ +#include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ + +#define LDM_BUCKET_SIZE_LOG 3 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_RLOG 7 +#define LDM_HASH_CHAR_OFFSET 10 + +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams) +{ + U32 const windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); + if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; + if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; + if (cParams->strategy >= ZSTD_btopt) { + /* Get out of the way of the optimal parser */ + U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength); + assert(minMatch >= ZSTD_LDM_MINMATCH_MIN); + assert(minMatch <= ZSTD_LDM_MINMATCH_MAX); + params->minMatchLength = minMatch; + } + if (params->hashLog == 0) { + params->hashLog = MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG); + assert(params->hashLog <= ZSTD_HASHLOG_MAX); + } + if (params->hashEveryLog == 0) { + params->hashEveryLog = + windowLog < params->hashLog ? 
0 : windowLog - params->hashLog; + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); +} + +size_t ZSTD_ldm_getTableSize(ldmParams_t params) +{ + size_t const ldmHSize = ((size_t)1) << params.hashLog; + size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); + size_t const ldmBucketSize = + ((size_t)1) << (params.hashLog - ldmBucketSizeLog); + size_t const totalSize = ldmBucketSize + ldmHSize * sizeof(ldmEntry_t); + return params.enableLdm ? totalSize : 0; +} + +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) +{ + return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; +} + +/** ZSTD_ldm_getSmallHash() : + * numBits should be <= 32 + * If numBits==0, returns 0. + * @return : the most significant numBits of value. */ +static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits) +{ + assert(numBits <= 32); + return numBits == 0 ? 0 : (U32)(value >> (64 - numBits)); +} + +/** ZSTD_ldm_getChecksum() : + * numBitsToDiscard should be <= 32 + * @return : the next most significant 32 bits after numBitsToDiscard */ +static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard) +{ + assert(numBitsToDiscard <= 32); + return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF; +} + +/** ZSTD_ldm_getTag() ; + * Given the hash, returns the most significant numTagBits bits + * after (32 + hbits) bits. + * + * If there are not enough bits remaining, return the last + * numTagBits bits. */ +static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits) +{ + assert(numTagBits < 32 && hbits <= 32); + if (32 - hbits < numTagBits) { + return hash & (((U32)1 << numTagBits) - 1); + } else { + return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1); + } +} + +/** ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ +static ldmEntry_t* ZSTD_ldm_getBucket( + ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) +{ + return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); +} + +/** ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ +static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, + ldmParams_t const ldmParams) +{ + BYTE* const bucketOffsets = ldmState->bucketOffsets; + *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry; + bucketOffsets[hash]++; + bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1; +} + +/** ZSTD_ldm_makeEntryAndInsertByTag() : + * + * Gets the small hash, checksum, and tag from the rollingHash. + * + * If the tag matches (1 << ldmParams.hashEveryLog)-1, then + * creates an ldmEntry from the offset, and inserts it into the hash table. + * + * hBits is the length of the small hash, which is the most significant hBits + * of rollingHash. The checksum is the next 32 most significant bits, followed + * by ldmParams.hashEveryLog bits that make up the tag. */ +static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState, + U64 const rollingHash, + U32 const hBits, + U32 const offset, + ldmParams_t const ldmParams) +{ + U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashEveryLog); + U32 const tagMask = ((U32)1 << ldmParams.hashEveryLog) - 1; + if (tag == tagMask) { + U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits); + U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); + ldmEntry_t entry; + entry.offset = offset; + entry.checksum = checksum; + ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams); + } +} + +/** ZSTD_ldm_getRollingHash() : + * Get a 64-bit hash using the first len bytes from buf. + * + * Giving bytes s = s_1, s_2, ... s_k, the hash is defined to be + * H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... 
+ s_k*(a^0) + * + * where the constant a is defined to be prime8bytes. + * + * The implementation adds an offset to each byte, so + * H(s) = (s_1 + HASH_CHAR_OFFSET)*(a^(k-1)) + ... */ +static U64 ZSTD_ldm_getRollingHash(const BYTE* buf, U32 len) +{ + U64 ret = 0; + U32 i; + for (i = 0; i < len; i++) { + ret *= prime8bytes; + ret += buf[i] + LDM_HASH_CHAR_OFFSET; + } + return ret; +} + +/** ZSTD_ldm_ipow() : + * Return base^exp. */ +static U64 ZSTD_ldm_ipow(U64 base, U64 exp) +{ + U64 ret = 1; + while (exp) { + if (exp & 1) { ret *= base; } + exp >>= 1; + base *= base; + } + return ret; +} + +U64 ZSTD_ldm_getHashPower(U32 minMatchLength) { + DEBUGLOG(4, "ZSTD_ldm_getHashPower: mml=%u", minMatchLength); + assert(minMatchLength >= ZSTD_LDM_MINMATCH_MIN); + return ZSTD_ldm_ipow(prime8bytes, minMatchLength - 1); +} + +/** ZSTD_ldm_updateHash() : + * Updates hash by removing toRemove and adding toAdd. */ +static U64 ZSTD_ldm_updateHash(U64 hash, BYTE toRemove, BYTE toAdd, U64 hashPower) +{ + hash -= ((toRemove + LDM_HASH_CHAR_OFFSET) * hashPower); + hash *= prime8bytes; + hash += toAdd + LDM_HASH_CHAR_OFFSET; + return hash; +} + +/** ZSTD_ldm_countBackwardsMatch() : + * Returns the number of bytes that match backwards before pIn and pMatch. + * + * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */ +static size_t ZSTD_ldm_countBackwardsMatch( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pBase) +{ + size_t matchLength = 0; + while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +/** ZSTD_ldm_fillFastTables() : + * + * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies. + * This is similar to ZSTD_loadDictionaryContent. + * + * The tables for the other strategies are filled within their + * block compressors. 
*/ +static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + ZSTD_compressionParameters const* cParams, + void const* end) +{ + const BYTE* const iend = (const BYTE*)end; + + switch(cParams->strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, cParams, iend); + ms->nextToUpdate = (U32)(iend - ms->window.base); + break; + + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, cParams, iend); + ms->nextToUpdate = (U32)(iend - ms->window.base); + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + break; + default: + assert(0); /* not possible : not a valid strategy id */ + } + + return 0; +} + +/** ZSTD_ldm_fillLdmHashTable() : + * + * Fills hashTable from (lastHashed + 1) to iend (non-inclusive). + * lastHash is the rolling hash that corresponds to lastHashed. + * + * Returns the rolling hash corresponding to position iend-1. */ +static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state, + U64 lastHash, const BYTE* lastHashed, + const BYTE* iend, const BYTE* base, + U32 hBits, ldmParams_t const ldmParams) +{ + U64 rollingHash = lastHash; + const BYTE* cur = lastHashed + 1; + + while (cur < iend) { + rollingHash = ZSTD_ldm_updateHash(rollingHash, cur[-1], + cur[ldmParams.minMatchLength-1], + state->hashPower); + ZSTD_ldm_makeEntryAndInsertByTag(state, + rollingHash, hBits, + (U32)(cur - base), ldmParams); + ++cur; + } + return rollingHash; +} + + +/** ZSTD_ldm_limitTableUpdate() : + * + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) +{ + U32 const current = (U32)(anchor - ms->window.base); + if (current > ms->nextToUpdate + 1024) { + ms->nextToUpdate = + current - MIN(512, current - ms->nextToUpdate - 1024); + } +} + +static size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + /* LDM parameters */ + int const extDict = ZSTD_window_hasExtDict(ldmState->window); + U32 const minMatchLength = params->minMatchLength; + U64 const hashPower = ldmState->hashPower; + U32 const hBits = params->hashLog - params->bucketSizeLog; + U32 const ldmBucketSize = 1U << params->bucketSizeLog; + U32 const hashEveryLog = params->hashEveryLog; + U32 const ldmTagMask = (1U << params->hashEveryLog) - 1; + /* Prefix and extDict parameters */ + U32 const dictLimit = ldmState->window.dictLimit; + U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit; + BYTE const* const base = ldmState->window.base; + BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL; + BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL; + BYTE const* const dictEnd = extDict ? 
dictBase + dictLimit : NULL; + BYTE const* const lowPrefixPtr = base + dictLimit; + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE); + /* Input positions */ + BYTE const* anchor = istart; + BYTE const* ip = istart; + /* Rolling hash */ + BYTE const* lastHashed = NULL; + U64 rollingHash = 0; + + while (ip <= ilimit) { + size_t mLength; + U32 const current = (U32)(ip - base); + size_t forwardMatchLength = 0, backwardMatchLength = 0; + ldmEntry_t* bestEntry = NULL; + if (ip != istart) { + rollingHash = ZSTD_ldm_updateHash(rollingHash, lastHashed[0], + lastHashed[minMatchLength], + hashPower); + } else { + rollingHash = ZSTD_ldm_getRollingHash(ip, minMatchLength); + } + lastHashed = ip; + + /* Do not insert and do not look for a match */ + if (ZSTD_ldm_getTag(rollingHash, hBits, hashEveryLog) != ldmTagMask) { + ip++; + continue; + } + + /* Get the best entry and compute the match lengths */ + { + ldmEntry_t* const bucket = + ZSTD_ldm_getBucket(ldmState, + ZSTD_ldm_getSmallHash(rollingHash, hBits), + *params); + ldmEntry_t* cur; + size_t bestMatchLength = 0; + U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); + + for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) { + size_t curForwardMatchLength, curBackwardMatchLength, + curTotalMatchLength; + if (cur->checksum != checksum || cur->offset <= lowestIndex) { + continue; + } + if (extDict) { + BYTE const* const curMatchBase = + cur->offset < dictLimit ? dictBase : base; + BYTE const* const pMatch = curMatchBase + cur->offset; + BYTE const* const matchEnd = + cur->offset < dictLimit ? dictEnd : iend; + BYTE const* const lowMatchPtr = + cur->offset < dictLimit ? 
dictStart : lowPrefixPtr; + + curForwardMatchLength = ZSTD_count_2segments( + ip, pMatch, iend, + matchEnd, lowPrefixPtr); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, + lowMatchPtr); + curTotalMatchLength = curForwardMatchLength + + curBackwardMatchLength; + } else { /* !extDict */ + BYTE const* const pMatch = base + cur->offset; + curForwardMatchLength = ZSTD_count(ip, pMatch, iend); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, + lowPrefixPtr); + curTotalMatchLength = curForwardMatchLength + + curBackwardMatchLength; + } + + if (curTotalMatchLength > bestMatchLength) { + bestMatchLength = curTotalMatchLength; + forwardMatchLength = curForwardMatchLength; + backwardMatchLength = curBackwardMatchLength; + bestEntry = cur; + } + } + } + + /* No match found -- continue searching */ + if (bestEntry == NULL) { + ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, + hBits, current, + *params); + ip++; + continue; + } + + /* Match found */ + mLength = forwardMatchLength + backwardMatchLength; + ip -= backwardMatchLength; + + { + /* Store the sequence: + * ip = current - backwardMatchLength + * The match is at (bestEntry->offset - backwardMatchLength) + */ + U32 const matchIndex = bestEntry->offset; + U32 const offset = current - matchIndex; + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(ip - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } + + /* Insert the current entry into the hash table */ + ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits, + (U32)(lastHashed - base), + *params); + + assert(ip + backwardMatchLength == lastHashed); + + /* Fill the hash table from 
lastHashed+1 to ip+mLength*/ + /* Heuristic: don't need to fill the entire table at end of block */ + if (ip + mLength <= ilimit) { + rollingHash = ZSTD_ldm_fillLdmHashTable( + ldmState, rollingHash, lastHashed, + ip + mLength, base, hBits, *params); + lastHashed = ip + mLength - 1; + } + ip += mLength; + anchor = ip; + } + return iend - anchor; +} + +/*! ZSTD_ldm_reduceTable() : + * reduce table indexes by `reducerValue` */ +static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u].offset < reducerValue) table[u].offset = 0; + else table[u].offset -= reducerValue; + } +} + +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldmState, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + U32 const maxDist = 1U << params->windowLog; + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + size_t const kMaxChunkSize = 1 << 20; + size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0); + size_t chunk; + size_t leftoverSize = 0; + + assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize); + /* Check that ZSTD_window_update() has been called for this chunk prior + * to passing it to this function. + */ + assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize); + /* The input could be very large (in zstdmt), so it must be broken up into + * chunks to enforce the maximmum distance and handle overflow correction. + */ + assert(sequences->pos <= sequences->size); + assert(sequences->size <= sequences->capacity); + for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) { + BYTE const* const chunkStart = istart + chunk * kMaxChunkSize; + size_t const remaining = (size_t)(iend - chunkStart); + BYTE const *const chunkEnd = + (remaining < kMaxChunkSize) ? 
iend : chunkStart + kMaxChunkSize; + size_t const chunkSize = chunkEnd - chunkStart; + size_t newLeftoverSize; + size_t const prevSize = sequences->size; + + assert(chunkStart < iend); + /* 1. Perform overflow correction if necessary. */ + if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { + U32 const ldmHSize = 1U << params->hashLog; + U32 const correction = ZSTD_window_correctOverflow( + &ldmState->window, /* cycleLog */ 0, maxDist, src); + ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); + } + /* 2. We enforce the maximum offset allowed. + * + * kMaxChunkSize should be small enough that we don't lose too much of + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the + * the offset against maxDist directly. + */ + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL); + /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ + newLeftoverSize = ZSTD_ldm_generateSequences_internal( + ldmState, sequences, params, chunkStart, chunkSize); + if (ZSTD_isError(newLeftoverSize)) + return newLeftoverSize; + /* 4. We add the leftover literals from previous iterations to the first + * newly generated sequence, or add the `newLeftoverSize` if none are + * generated. 
+ */ + /* Prepend the leftover literals from the last call */ + if (prevSize < sequences->size) { + sequences->seq[prevSize].litLength += (U32)leftoverSize; + leftoverSize = newLeftoverSize; + } else { + assert(newLeftoverSize == chunkSize); + leftoverSize += chunkSize; + } + } + return 0; +} + +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; + if (srcSize <= seq->litLength) { + /* Skip past srcSize literals */ + seq->litLength -= (U32)srcSize; + return; + } + srcSize -= seq->litLength; + seq->litLength = 0; + if (srcSize < seq->matchLength) { + /* Skip past the first srcSize of the match */ + seq->matchLength -= (U32)srcSize; + if (seq->matchLength < minMatch) { + /* The match is too short, omit it */ + if (rawSeqStore->pos + 1 < rawSeqStore->size) { + seq[1].litLength += seq[0].matchLength; + } + rawSeqStore->pos++; + } + return; + } + srcSize -= seq->matchLength; + seq->matchLength = 0; + rawSeqStore->pos++; + } +} + +/** + * If the sequence length is longer than remaining then the sequence is split + * between this block and the next. + * + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. + */ +static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) +{ + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; + assert(sequence.offset > 0); + /* Likely: No partial sequence */ + if (remaining >= sequence.litLength + sequence.matchLength) { + rawSeqStore->pos++; + return sequence; + } + /* Cut the sequence short (offset == 0 ==> rest is literals). 
*/ + if (remaining <= sequence.litLength) { + sequence.offset = 0; + } else if (remaining < sequence.litLength + sequence.matchLength) { + sequence.matchLength = remaining - sequence.litLength; + if (sequence.matchLength < minMatch) { + sequence.offset = 0; + } + } + /* Skip past `remaining` bytes for the future sequences. */ + ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch); + return sequence; +} + +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize, + int const extDict) +{ + unsigned const minMatch = cParams->searchLength; + ZSTD_blockCompressor const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, extDict); + BYTE const* const base = ms->window.base; + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + /* Input positions */ + BYTE const* ip = istart; + + assert(rawSeqStore->pos <= rawSeqStore->size); + assert(rawSeqStore->size <= rawSeqStore->capacity); + /* Loop through each sequence and apply the block compressor to the lits */ + while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); + int i; + /* End signal */ + if (sequence.offset == 0) + break; + + assert(sequence.offset <= (1U << cParams->windowLog)); + assert(ip + sequence.litLength + sequence.matchLength <= iend); + + /* Fill tables for block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, cParams, ip); + /* Run the block compressor */ + { + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, cParams, ip, + sequence.litLength); + ip += sequence.litLength; + ms->nextToUpdate = (U32)(ip - base); + /* Update the repcodes */ + for (i = ZSTD_REP_NUM - 1; i > 0; i--) + rep[i] = 
rep[i-1]; + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, + sequence.offset + ZSTD_REP_MOVE, + sequence.matchLength - MINMATCH); + ip += sequence.matchLength; + } + } + /* Fill the tables for the block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, cParams, ip); + /* Compress the last literals */ + { + size_t const lastLiterals = blockCompressor(ms, seqStore, rep, cParams, + ip, iend - ip); + ms->nextToUpdate = (U32)(iend - base); + return lastLiterals; + } +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_ldm.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_ldm.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#ifndef ZSTD_LDM_H +#define ZSTD_LDM_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "zstd_compress_internal.h" /* ldmParams_t, U32 */ +#include "zstd.h" /* ZSTD_CCtx, size_t */ + +/*-************************************* +* Long distance matching +***************************************/ + +#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_DEFAULTMAX + +/** + * ZSTD_ldm_generateSequences(): + * + * Generates the sequences using the long distance match finder. + * Generates long range matching sequences in `sequences`, which parse a prefix + * of the source. `sequences` must be large enough to store every sequence, + * which can be checked with `ZSTD_ldm_getMaxNbSeq()`. + * @returns 0 or an error code. 
+ * + * NOTE: The user must have called ZSTD_window_update() for all of the input + * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks. + * NOTE: This function returns an error if it runs out of space to store + * sequences. + */ +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldms, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + +/** + * ZSTD_ldm_blockCompress(): + * + * Compresses a block using the predefined sequences, along with a secondary + * block compressor. The literals section of every sequence is passed to the + * secondary block compressor, and those sequences are interspersed with the + * predefined sequences. Returns the length of the last literals. + * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed. + * `rawSeqStore.seq` may also be updated to split the last sequence between two + * blocks. + * @return The length of the last literals. + * + * NOTE: The source must be at most the maximum block size, but the predefined + * sequences can be any size, and may be longer than the block. In the case that + * they are longer than the block, the last sequences may need to be split into + * two. We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, + void const* src, size_t srcSize, + int const extDict); + +/** + * ZSTD_ldm_skipSequences(): + * + * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`. + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). 
+ */ +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + +/** ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is + * disabled. + */ +size_t ZSTD_ldm_getTableSize(ldmParams_t params); + +/** ZSTD_ldm_getSeqSpace() : + * Return an upper bound on the number of sequences that can be produced by + * the long distance matcher, or 0 if LDM is disabled. + */ +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + +/** ZSTD_ldm_getTableSize() : + * Return prime8bytes^(minMatchLength-1) */ +U64 ZSTD_ldm_getHashPower(U32 minMatchLength); + +/** ZSTD_ldm_adjustParameters() : + * If the params->hashEveryLog is not set, set it to its default value based on + * windowLog and params->hashLog. + * + * Ensures that params->bucketSizeLog is <= params->hashLog (setting it to + * params->hashLog if it is not). + * + * Ensures that the minMatchLength >= targetLength during optimal parsing. + */ +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_FAST_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_opt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstd_opt.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,923 @@ +/* + * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_compress_internal.h" +#include "zstd_opt.h" + + +#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats. 
Also used for matchSum (?) */ +#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ +#define ZSTD_MAX_PRICE (1<<30) + + +/*-************************************* +* Price functions for optimal parser +***************************************/ +static void ZSTD_setLog2Prices(optState_t* optPtr) +{ + optPtr->log2litSum = ZSTD_highbit32(optPtr->litSum+1); + optPtr->log2litLengthSum = ZSTD_highbit32(optPtr->litLengthSum+1); + optPtr->log2matchLengthSum = ZSTD_highbit32(optPtr->matchLengthSum+1); + optPtr->log2offCodeSum = ZSTD_highbit32(optPtr->offCodeSum+1); +} + + +static void ZSTD_rescaleFreqs(optState_t* const optPtr, + const BYTE* const src, size_t const srcSize) +{ + optPtr->staticPrices = 0; + + if (optPtr->litLengthSum == 0) { /* first init */ + unsigned u; + if (srcSize <= 1024) optPtr->staticPrices = 1; + + assert(optPtr->litFreq!=NULL); + for (u=0; u<=MaxLit; u++) + optPtr->litFreq[u] = 0; + for (u=0; ulitFreq[src[u]]++; + optPtr->litSum = 0; + for (u=0; u<=MaxLit; u++) { + optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> ZSTD_FREQ_DIV); + optPtr->litSum += optPtr->litFreq[u]; + } + + for (u=0; u<=MaxLL; u++) + optPtr->litLengthFreq[u] = 1; + optPtr->litLengthSum = MaxLL+1; + for (u=0; u<=MaxML; u++) + optPtr->matchLengthFreq[u] = 1; + optPtr->matchLengthSum = MaxML+1; + for (u=0; u<=MaxOff; u++) + optPtr->offCodeFreq[u] = 1; + optPtr->offCodeSum = (MaxOff+1); + + } else { + unsigned u; + + optPtr->litSum = 0; + for (u=0; u<=MaxLit; u++) { + optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> (ZSTD_FREQ_DIV+1)); + optPtr->litSum += optPtr->litFreq[u]; + } + optPtr->litLengthSum = 0; + for (u=0; u<=MaxLL; u++) { + optPtr->litLengthFreq[u] = 1 + (optPtr->litLengthFreq[u]>>(ZSTD_FREQ_DIV+1)); + optPtr->litLengthSum += optPtr->litLengthFreq[u]; + } + optPtr->matchLengthSum = 0; + for (u=0; u<=MaxML; u++) { + optPtr->matchLengthFreq[u] = 1 + (optPtr->matchLengthFreq[u]>>ZSTD_FREQ_DIV); + optPtr->matchLengthSum += 
optPtr->matchLengthFreq[u]; + } + optPtr->offCodeSum = 0; + for (u=0; u<=MaxOff; u++) { + optPtr->offCodeFreq[u] = 1 + (optPtr->offCodeFreq[u]>>ZSTD_FREQ_DIV); + optPtr->offCodeSum += optPtr->offCodeFreq[u]; + } + } + + ZSTD_setLog2Prices(optPtr); +} + + +/* ZSTD_rawLiteralsCost() : + * cost of literals (only) in given segment (which length can be null) + * does not include cost of literalLength symbol */ +static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr) +{ + if (optPtr->staticPrices) return (litLength*6); /* 6 bit per literal - no statistic used */ + if (litLength == 0) return 0; + + /* literals */ + { U32 u; + U32 cost = litLength * optPtr->log2litSum; + for (u=0; u < litLength; u++) + cost -= ZSTD_highbit32(optPtr->litFreq[literals[u]]+1); + return cost; + } +} + +/* ZSTD_litLengthPrice() : + * cost of literalLength symbol */ +static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr) +{ + if (optPtr->staticPrices) return ZSTD_highbit32((U32)litLength+1); + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + U32 const price = LL_bits[llCode] + optPtr->log2litLengthSum - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1); + return price; + } +} + +/* ZSTD_litLengthPrice() : + * cost of the literal part of a sequence, + * including literals themselves, and literalLength symbol */ +static U32 ZSTD_fullLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr) +{ + return ZSTD_rawLiteralsCost(literals, litLength, optPtr) + + ZSTD_litLengthPrice(litLength, optPtr); +} + +/* ZSTD_litLengthContribution() : + * @return ( cost(litlength) - cost(0) ) + * this value can then be added to rawLiteralsCost() + * to provide a cost which is directly comparable to a match ending at same position */ +static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr) +{ + if (optPtr->staticPrices) return 
ZSTD_highbit32(litLength+1); + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + int const contribution = LL_bits[llCode] + + ZSTD_highbit32(optPtr->litLengthFreq[0]+1) + - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1); +#if 1 + return contribution; +#else + return MAX(0, contribution); /* sometimes better, sometimes not ... */ +#endif + } +} + +/* ZSTD_literalsContribution() : + * creates a fake cost for the literals part of a sequence + * which can be compared to the ending cost of a match + * should a new match start at this position */ +static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr) +{ + int const contribution = ZSTD_rawLiteralsCost(literals, litLength, optPtr) + + ZSTD_litLengthContribution(litLength, optPtr); + return contribution; +} + +/* ZSTD_getMatchPrice() : + * Provides the cost of the match part (offset + matchLength) of a sequence + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_getMatchPrice( + U32 const offset, U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) +{ + U32 price; + U32 const offCode = ZSTD_highbit32(offset+1); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + + if (optPtr->staticPrices) /* fixed scheme, do not use statistics */ + return ZSTD_highbit32((U32)mlBase+1) + 16 + offCode; + + price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1); + if ((optLevel<2) /*static*/ && offCode >= 20) price += (offCode-19)*2; /* handicap for long distance offsets, favor decompression speed */ + + /* match Length */ + { U32 const mlCode = ZSTD_MLcode(mlBase); + price += ML_bits[mlCode] + optPtr->log2matchLengthSum - ZSTD_highbit32(optPtr->matchLengthFreq[mlCode]+1); + } + + DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price); + return price; +} + +static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, + U32 offsetCode, U32 matchLength) +{ + /* literals */ + { U32 u; + for (u=0; u < litLength; u++) + optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + optPtr->litSum += litLength*ZSTD_LITFREQ_ADD; + } + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + optPtr->litLengthFreq[llCode]++; + optPtr->litLengthSum++; + } + + /* match offset code (0-2=>repCode; 3+=>offset+2) */ + { U32 const offCode = ZSTD_highbit32(offsetCode+1); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; + } + + /* match Length */ + { U32 const mlBase = matchLength - MINMATCH; + U32 const mlCode = ZSTD_MLcode(mlBase); + optPtr->matchLengthFreq[mlCode]++; + optPtr->matchLengthSum++; + } +} + + +/* ZSTD_readMINMATCH() : + * function safe only for comparisons + * assumption : memPtr must be at least 4 bytes before end of buffer */ +MEM_STATIC 
U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) +{ + switch (length) + { + default : + case 4 : return MEM_read32(memPtr); + case 3 : if (MEM_isLittleEndian()) + return MEM_read32(memPtr)<<8; + else + return MEM_read32(memPtr)>>8; + } +} + + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, const BYTE* const ip) +{ + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; + const BYTE* const base = ms->window.base; + U32 idx = ms->nextToUpdate3; + U32 const target = ms->nextToUpdate3 = (U32)(ip - base); + size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3); + assert(hashLog3 > 0); + + while(idx < target) { + hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx; + idx++; + } + + return hashTable3[hash3]; +} + + +/*-************************************* +* Binary Tree search +***************************************/ +/** ZSTD_insertBt1() : add one or multiple positions to tree. + * ip : assumed <= iend-8 . + * @return : nb of positions added */ +static U32 ZSTD_insertBt1( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iend, + U32 const mls, U32 const extDict) +{ + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + const U32 current = (U32)(ip-base); + const U32 btLow = btMask >= current ? 
0 : current - btMask; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = smallerPtr + 1; + U32 dummy32; /* to be nullified at the end */ + U32 const windowLow = ms->window.lowLimit; + U32 matchEndIdx = current+8+1; + size_t bestLength = 8; + U32 nbCompares = 1U << cParams->searchLog; +#ifdef ZSTD_C_PREDICT + U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0); + U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1); + predictedSmall += (predictedSmall>0); + predictedLarge += (predictedLarge>0); +#endif /* ZSTD_C_PREDICT */ + + DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current); + + assert(ip <= iend-8); /* required for h calculation */ + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < current); + +#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ + const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ + if (matchIndex == predictedSmall) { + /* no need to check length, result known */ + *smallerPtr = matchIndex; + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + predictedSmall = predictPtr[1] + (predictPtr[1]>0); + continue; + } + if (matchIndex == predictedLarge) { + *largerPtr = matchIndex; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + predictedLarge = predictPtr[0] + (predictPtr[0]>0); + continue; + } +#endif + + if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* might be wrong 
if extDict is incorrectly set to 0 */ + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + bestLength = matchLength; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + } + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + if (bestLength > 384) return MIN(192, (U32)(bestLength - 384)); /* speed optimization */ + assert(matchEndIdx > current + 8); + return matchEndIdx - (current + 8); +} + +FORCE_INLINE_TEMPLATE +void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const 
U32 extDict) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (extDict:%u)", + idx, target, extDict); + + while(idx < target) + idx += ZSTD_insertBt1(ms, cParams, base+idx, iend, mls, extDict); + ms->nextToUpdate = target; +} + +void ZSTD_updateTree( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* iend) +{ + ZSTD_updateTree_internal(ms, cParams, ip, iend, cParams->searchLength, 0 /*extDict*/); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_insertBtAndGetAllMatches ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* const ip, const BYTE* const iLimit, int const extDict, + U32 rep[ZSTD_REP_NUM], U32 const ll0, + ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */) +{ + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + const BYTE* const base = ms->window.base; + U32 const current = (U32)(ip-base); + U32 const hashLog = cParams->hashLog; + U32 const minMatch = (mls==3) ? 3 : 4; + U32* const hashTable = ms->hashTable; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask= (1U << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const dictBase = ms->window.dictBase; + U32 const dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32 const btLow = btMask >= current ? 
0 : current - btMask; + U32 const windowLow = ms->window.lowLimit; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = bt + 2*(current&btMask) + 1; + U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */ + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + + size_t bestLength = lengthToBeat-1; + DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches"); + + /* check repCode */ + { U32 const lastR = ZSTD_REP_NUM + ll0; + U32 repCode; + for (repCode = ll0; repCode < lastR; repCode++) { + U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + U32 const repIndex = current - repOffset; + U32 repLen = 0; + assert(current >= dictLimit); + if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) { /* equivalent to `current > repIndex >= dictLimit` */ + if (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch)) { + repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; + } + } else { /* repIndex < dictLimit || repIndex >= current */ + const BYTE* const repMatch = dictBase + repIndex; + assert(current >= windowLow); + if ( extDict /* this case only valid in extDict mode */ + && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */ + & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } } + /* save longer solution */ + if (repLen > bestLength) { + DEBUGLOG(8, "found rep-match %u of length %u", + repCode - ll0, (U32)repLen); + bestLength = repLen; + matches[mnum].off = repCode - ll0; + matches[mnum].len = (U32)repLen; + mnum++; + if ( 
(repLen > sufficient_len) + | (ip+repLen == iLimit) ) { /* best possible */ + return mnum; + } } } } + + /* HC3 match finder */ + if ((mls == 3) /*static*/ && (bestLength < mls)) { + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, ip); + if ((matchIndex3 > windowLow) + & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { + size_t mlen; + if ((!extDict) /*static*/ || (matchIndex3 >= dictLimit)) { + const BYTE* const match = base + matchIndex3; + mlen = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex3; + mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart); + } + + /* save best solution */ + if (mlen >= mls /* == 3 > bestLength */) { + DEBUGLOG(8, "found small match with hlog3, of length %u", + (U32)mlen); + bestLength = mlen; + assert(current > matchIndex3); + assert(mnum==0); /* no prior solution */ + matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE; + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | + (ip+mlen == iLimit) ) { /* best possible length */ + ms->nextToUpdate = current+1; /* skip insertion */ + return 1; + } } } } + + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match; + assert(current > matchIndex); + + if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* 
prepare for match[matchLength] */ + } + + if (matchLength > bestLength) { + DEBUGLOG(8, "found match of length %u at distance %u", + (U32)matchLength, current - matchIndex); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].len = (U32)matchLength; + mnum++; + if (matchLength > ZSTD_OPT_NUM) break; + if (ip+matchLength == iLimit) { /* equal : no way to know if inf or sup */ + break; /* drop, to preserve bt consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */ + } else { + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + assert(matchEndIdx > current+8); + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + return mnum; +} + + +FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* const iHighLimit, int const extDict, + U32 rep[ZSTD_REP_NUM], U32 const ll0, + ZSTD_match_t* matches, U32 const lengthToBeat) +{ + U32 const matchLengthSearch = cParams->searchLength; + DEBUGLOG(7, "ZSTD_BtGetAllMatches"); + if (ip < ms->window.base + ms->nextToUpdate) return 
0; /* skipped area */ + ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, extDict); + switch(matchLengthSearch) + { + case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 3); + default : + case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 4); + case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 5); + case 7 : + case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 6); + } +} + + +/*-******************************* +* Optimal parser +*********************************/ +typedef struct repcodes_s { + U32 rep[3]; +} repcodes_t; + +repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) +{ + repcodes_t newReps; + if (offset >= ZSTD_REP_NUM) { /* full offset */ + newReps.rep[2] = rep[1]; + newReps.rep[1] = rep[0]; + newReps.rep[0] = offset - ZSTD_REP_MOVE; + } else { /* repcode */ + U32 const repCode = offset + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + newReps.rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; + newReps.rep[1] = rep[0]; + newReps.rep[0] = currentOffset; + } else { /* repCode == 0 */ + memcpy(&newReps, rep, sizeof(newReps)); + } + } + return newReps; +} + + +typedef struct { + const BYTE* anchor; + U32 litlen; + U32 rawLitCost; +} cachedLiteralPrice_t; + +static U32 ZSTD_rawLiteralsCost_cached( + cachedLiteralPrice_t* const cachedLitPrice, + const BYTE* const anchor, U32 const litlen, + const optState_t* const optStatePtr) +{ + U32 startCost; + U32 remainingLength; + const BYTE* startPosition; + + if (anchor == cachedLitPrice->anchor) { + startCost = cachedLitPrice->rawLitCost; + startPosition = anchor + cachedLitPrice->litlen; + assert(litlen >= cachedLitPrice->litlen); + remainingLength = litlen - cachedLitPrice->litlen; + } else { + startCost = 0; + startPosition = anchor; + remainingLength = litlen; + } + + { U32 const rawLitCost = startCost + ZSTD_rawLiteralsCost(startPosition, remainingLength, optStatePtr); + cachedLitPrice->anchor = anchor; + cachedLitPrice->litlen = litlen; + cachedLitPrice->rawLitCost = rawLitCost; + return rawLitCost; + } +} + +static U32 ZSTD_fullLiteralsCost_cached( + cachedLiteralPrice_t* const cachedLitPrice, + const BYTE* const anchor, U32 const litlen, + const optState_t* const optStatePtr) +{ + return ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr) + + ZSTD_litLengthPrice(litlen, optStatePtr); +} + +static int ZSTD_literalsContribution_cached( + cachedLiteralPrice_t* const cachedLitPrice, + const BYTE* const anchor, U32 const litlen, + const optState_t* const optStatePtr) +{ + int const contribution = ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr) + + ZSTD_litLengthContribution(litlen, optStatePtr); + return contribution; +} + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, + const void* src, size_t srcSize, + const int optLevel, 
const int extDict) +{ + optState_t* const optStatePtr = &ms->opt; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + U32 const minMatch = (cParams->searchLength == 3) ? 3 : 4; + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; + cachedLiteralPrice_t cachedLitPrice; + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_opt_generic"); + ms->nextToUpdate3 = ms->nextToUpdate; + ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize); + ip += (ip==prefixStart); + memset(&cachedLitPrice, 0, sizeof(cachedLitPrice)); + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, last_pos = 0; + U32 best_mlen, best_off; + + /* find first match */ + { U32 const litlen = (U32)(ip - anchor); + U32 const ll0 = !litlen; + U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, extDict, rep, ll0, matches, minMatch); + if (!nbMatches) { ip++; continue; } + + /* initialize opt[0] */ + { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; + DEBUGLOG(7, "found %u matches of maxLength=%u and offset=%u at cPos=%u => start new serie", + nbMatches, maxML, matches[nbMatches-1].off, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + best_mlen = maxML; + best_off = matches[nbMatches-1].off; + DEBUGLOG(7, "large match (%u>%u), immediate encoding", + best_mlen, sufficient_len); + cur = 0; + last_pos = 1; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + { U32 const literalsPrice = ZSTD_fullLiteralsCost_cached(&cachedLitPrice, anchor, litlen, optStatePtr); + U32 pos; + U32 matchNb; + for (pos = 0; pos < minMatch; pos++) { + 
opt[pos].mlen = 1; + opt[pos].price = ZSTD_MAX_PRICE; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const end = matches[matchNb].len; + repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0); + for ( ; pos <= end ; pos++ ) { + U32 const matchPrice = literalsPrice + ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); + DEBUGLOG(7, "rPos:%u => set initial price : %u", + pos, matchPrice); + opt[pos].mlen = pos; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = matchPrice; + memcpy(opt[pos].rep, &repHistory, sizeof(repHistory)); + } } + last_pos = pos-1; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; + assert(cur < ZSTD_OPT_NUM); + + /* Fix current position with one literal if cheaper */ + { U32 const litlen = (opt[cur-1].mlen == 1) ? opt[cur-1].litlen + 1 : 1; + int price; /* note : contribution can be negative */ + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_literalsContribution(inr-litlen, litlen, optStatePtr); + } else { + price = ZSTD_literalsContribution_cached(&cachedLitPrice, anchor, litlen, optStatePtr); + } + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { + DEBUGLOG(7, "rPos:%u : better price (%u<%u) using literal", + cur, price, opt[cur].price); + opt[cur].mlen = 1; + opt[cur].off = 0; + opt[cur].litlen = litlen; + opt[cur].price = price; + memcpy(opt[cur].rep, opt[cur-1].rep, sizeof(opt[cur].rep)); + } } + + /* last match must start at a minimum distance of 8 from oend */ + if (inr > ilimit) continue; + + if (cur == last_pos) break; + + if ( (optLevel==0) /*static*/ + && (opt[cur+1].price <= opt[cur].price) ) + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + + { U32 const ll0 = (opt[cur].mlen != 1); + U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0; + U32 const previousPrice = (cur > litlen) ? 
opt[cur-litlen].price : 0; + U32 const basePrice = previousPrice + ZSTD_fullLiteralsCost(inr-litlen, litlen, optStatePtr); + U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, extDict, opt[cur].rep, ll0, matches, minMatch); + U32 matchNb; + if (!nbMatches) continue; + + { U32 const maxML = matches[nbMatches-1].len; + DEBUGLOG(7, "rPos:%u, found %u matches, of maxLength=%u", + cur, nbMatches, maxML); + + if ( (maxML > sufficient_len) + | (cur + maxML >= ZSTD_OPT_NUM) ) { + best_mlen = maxML; + best_off = matches[nbMatches-1].off; + last_pos = cur + 1; + goto _shortestPath; + } + } + + /* set prices using matches found at position == cur */ + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + repcodes_t const repHistory = ZSTD_updateRep(opt[cur].rep, offset, ll0); + U32 const lastML = matches[matchNb].len; + U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + + DEBUGLOG(7, "testing match %u => offCode=%u, mlen=%u, llen=%u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { + U32 const pos = cur + mlen; + int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u => new better price (%u<%u)", + pos, price, opt[pos].price); + while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } + opt[pos].mlen = mlen; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = price; + memcpy(opt[pos].rep, &repHistory, sizeof(repHistory)); + } else { + if (optLevel==0) break; /* gets ~+10% speed for about -0.01 ratio loss */ + } + } } } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + best_mlen = opt[last_pos].mlen; + best_off = opt[last_pos].off; + cur = last_pos - best_mlen; + +_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 1); + + /* reverse 
traversal */ + DEBUGLOG(7, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); + { U32 selectedMatchLength = best_mlen; + U32 selectedOffset = best_off; + U32 pos = cur; + while (1) { + U32 const mlen = opt[pos].mlen; + U32 const off = opt[pos].off; + opt[pos].mlen = selectedMatchLength; + opt[pos].off = selectedOffset; + selectedMatchLength = mlen; + selectedOffset = off; + if (mlen > pos) break; + pos -= mlen; + } } + + /* save sequences */ + { U32 pos; + for (pos=0; pos < last_pos; ) { + U32 const llen = (U32)(ip - anchor); + U32 const mlen = opt[pos].mlen; + U32 const offset = opt[pos].off; + if (mlen == 1) { ip++; pos++; continue; } /* literal position => move on */ + pos += mlen; ip += mlen; + + /* repcodes update : like ZSTD_updateRep(), but update in place */ + if (offset >= ZSTD_REP_NUM) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = offset - ZSTD_REP_MOVE; + } else { /* repcode */ + U32 const repCode = offset + (llen==0); + if (repCode) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? 
(rep[0] - 1) : rep[repCode]; + if (repCode >= 2) rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = currentOffset; + } + } + + ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, offset, mlen-MINMATCH); + anchor = ip; + } } + ZSTD_setLog2Prices(optStatePtr); + } /* while (ip < ilimit) */ + + /* Return the last literals size */ + return iend - anchor; +} + + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/); +} + +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/); +} + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/); +} + +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/); +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstd_opt.h --- a/contrib/python-zstandard/zstd/compress/zstd_opt.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/compress/zstd_opt.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,919 +1,42 @@ -/** - * Copyright (c) 2016-present, 
Przemyslaw Skibinski, Yann Collet, Facebook, Inc. +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ - -/* Note : this file is intended to be included within zstd_compress.c */ - - -#ifndef ZSTD_OPT_H_91842398743 -#define ZSTD_OPT_H_91842398743 - - -#define ZSTD_LITFREQ_ADD 2 -#define ZSTD_FREQ_DIV 4 -#define ZSTD_MAX_PRICE (1<<30) - -/*-************************************* -* Price functions for optimal parser -***************************************/ -FORCE_INLINE void ZSTD_setLog2Prices(seqStore_t* ssPtr) -{ - ssPtr->log2matchLengthSum = ZSTD_highbit32(ssPtr->matchLengthSum+1); - ssPtr->log2litLengthSum = ZSTD_highbit32(ssPtr->litLengthSum+1); - ssPtr->log2litSum = ZSTD_highbit32(ssPtr->litSum+1); - ssPtr->log2offCodeSum = ZSTD_highbit32(ssPtr->offCodeSum+1); - ssPtr->factor = 1 + ((ssPtr->litSum>>5) / ssPtr->litLengthSum) + ((ssPtr->litSum<<1) / (ssPtr->litSum + ssPtr->matchSum)); -} - - -MEM_STATIC void ZSTD_rescaleFreqs(seqStore_t* ssPtr, const BYTE* src, size_t srcSize) -{ - unsigned u; - - ssPtr->cachedLiterals = NULL; - ssPtr->cachedPrice = ssPtr->cachedLitLength = 0; - ssPtr->staticPrices = 0; - - if (ssPtr->litLengthSum == 0) { - if (srcSize <= 1024) ssPtr->staticPrices = 1; - - for (u=0; u<=MaxLit; u++) - ssPtr->litFreq[u] = 0; - for (u=0; ulitFreq[src[u]]++; - - ssPtr->litSum = 0; - ssPtr->litLengthSum = MaxLL+1; - ssPtr->matchLengthSum = MaxML+1; - ssPtr->offCodeSum = (MaxOff+1); - 
ssPtr->matchSum = (ZSTD_LITFREQ_ADD<litFreq[u] = 1 + (ssPtr->litFreq[u]>>ZSTD_FREQ_DIV); - ssPtr->litSum += ssPtr->litFreq[u]; - } - for (u=0; u<=MaxLL; u++) - ssPtr->litLengthFreq[u] = 1; - for (u=0; u<=MaxML; u++) - ssPtr->matchLengthFreq[u] = 1; - for (u=0; u<=MaxOff; u++) - ssPtr->offCodeFreq[u] = 1; - } else { - ssPtr->matchLengthSum = 0; - ssPtr->litLengthSum = 0; - ssPtr->offCodeSum = 0; - ssPtr->matchSum = 0; - ssPtr->litSum = 0; - - for (u=0; u<=MaxLit; u++) { - ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u]>>(ZSTD_FREQ_DIV+1)); - ssPtr->litSum += ssPtr->litFreq[u]; - } - for (u=0; u<=MaxLL; u++) { - ssPtr->litLengthFreq[u] = 1 + (ssPtr->litLengthFreq[u]>>(ZSTD_FREQ_DIV+1)); - ssPtr->litLengthSum += ssPtr->litLengthFreq[u]; - } - for (u=0; u<=MaxML; u++) { - ssPtr->matchLengthFreq[u] = 1 + (ssPtr->matchLengthFreq[u]>>ZSTD_FREQ_DIV); - ssPtr->matchLengthSum += ssPtr->matchLengthFreq[u]; - ssPtr->matchSum += ssPtr->matchLengthFreq[u] * (u + 3); - } - ssPtr->matchSum *= ZSTD_LITFREQ_ADD; - for (u=0; u<=MaxOff; u++) { - ssPtr->offCodeFreq[u] = 1 + (ssPtr->offCodeFreq[u]>>ZSTD_FREQ_DIV); - ssPtr->offCodeSum += ssPtr->offCodeFreq[u]; - } - } - - ZSTD_setLog2Prices(ssPtr); -} - - -FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t* ssPtr, U32 litLength, const BYTE* literals) -{ - U32 price, u; - - if (ssPtr->staticPrices) - return ZSTD_highbit32((U32)litLength+1) + (litLength*6); - - if (litLength == 0) - return ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[0]+1); - - /* literals */ - if (ssPtr->cachedLiterals == literals) { - U32 const additional = litLength - ssPtr->cachedLitLength; - const BYTE* literals2 = ssPtr->cachedLiterals + ssPtr->cachedLitLength; - price = ssPtr->cachedPrice + additional * ssPtr->log2litSum; - for (u=0; u < additional; u++) - price -= ZSTD_highbit32(ssPtr->litFreq[literals2[u]]+1); - ssPtr->cachedPrice = price; - ssPtr->cachedLitLength = litLength; - } else { - price = litLength * ssPtr->log2litSum; - for (u=0; u < litLength; 
u++) - price -= ZSTD_highbit32(ssPtr->litFreq[literals[u]]+1); - - if (litLength >= 12) { - ssPtr->cachedLiterals = literals; - ssPtr->cachedPrice = price; - ssPtr->cachedLitLength = litLength; - } - } - - /* literal Length */ - { const BYTE LL_deltaCode = 19; - const BYTE llCode = (litLength>63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; - price += LL_bits[llCode] + ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[llCode]+1); - } - - return price; -} - - -FORCE_INLINE U32 ZSTD_getPrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength, const int ultra) -{ - /* offset */ - U32 price; - BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1); - - if (seqStorePtr->staticPrices) - return ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + ZSTD_highbit32((U32)matchLength+1) + 16 + offCode; - - price = offCode + seqStorePtr->log2offCodeSum - ZSTD_highbit32(seqStorePtr->offCodeFreq[offCode]+1); - if (!ultra && offCode >= 20) price += (offCode-19)*2; - - /* match Length */ - { const BYTE ML_deltaCode = 36; - const BYTE mlCode = (matchLength>127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength]; - price += ML_bits[mlCode] + seqStorePtr->log2matchLengthSum - ZSTD_highbit32(seqStorePtr->matchLengthFreq[mlCode]+1); - } - - return price + ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + seqStorePtr->factor; -} - - -MEM_STATIC void ZSTD_updatePrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength) -{ - U32 u; - - /* literals */ - seqStorePtr->litSum += litLength*ZSTD_LITFREQ_ADD; - for (u=0; u < litLength; u++) - seqStorePtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; - - /* literal Length */ - { const BYTE LL_deltaCode = 19; - const BYTE llCode = (litLength>63) ? 
(BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; - seqStorePtr->litLengthFreq[llCode]++; - seqStorePtr->litLengthSum++; - } - - /* match offset */ - { BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1); - seqStorePtr->offCodeSum++; - seqStorePtr->offCodeFreq[offCode]++; - } - - /* match Length */ - { const BYTE ML_deltaCode = 36; - const BYTE mlCode = (matchLength>127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength]; - seqStorePtr->matchLengthFreq[mlCode]++; - seqStorePtr->matchLengthSum++; - } - - ZSTD_setLog2Prices(seqStorePtr); -} - - -#define SET_PRICE(pos, mlen_, offset_, litlen_, price_) \ - { \ - while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } \ - opt[pos].mlen = mlen_; \ - opt[pos].off = offset_; \ - opt[pos].litlen = litlen_; \ - opt[pos].price = price_; \ - } - - - -/* Update hashTable3 up to ip (excluded) - Assumption : always within prefix (ie. not within extDict) */ -FORCE_INLINE -U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_CCtx* zc, const BYTE* ip) -{ - U32* const hashTable3 = zc->hashTable3; - U32 const hashLog3 = zc->hashLog3; - const BYTE* const base = zc->base; - U32 idx = zc->nextToUpdate3; - const U32 target = zc->nextToUpdate3 = (U32)(ip - base); - const size_t hash3 = ZSTD_hash3Ptr(ip, hashLog3); - - while(idx < target) { - hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx; - idx++; - } - - return hashTable3[hash3]; -} - +#ifndef ZSTD_OPT_H +#define ZSTD_OPT_H -/*-************************************* -* Binary Tree search -***************************************/ -static U32 ZSTD_insertBtAndGetAllMatches ( - ZSTD_CCtx* zc, - const BYTE* const ip, const BYTE* const iLimit, - U32 nbCompares, const U32 mls, - U32 extDict, ZSTD_match_t* matches, const U32 minMatchLen) -{ - const BYTE* const base = zc->base; - const U32 current = (U32)(ip-base); - const U32 hashLog = zc->params.cParams.hashLog; - const size_t h = ZSTD_hashPtr(ip, hashLog, mls); - U32* const hashTable = 
zc->hashTable; - U32 matchIndex = hashTable[h]; - U32* const bt = zc->chainTable; - const U32 btLog = zc->params.cParams.chainLog - 1; - const U32 btMask= (1U << btLog) - 1; - size_t commonLengthSmaller=0, commonLengthLarger=0; - const BYTE* const dictBase = zc->dictBase; - const U32 dictLimit = zc->dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const prefixStart = base + dictLimit; - const U32 btLow = btMask >= current ? 0 : current - btMask; - const U32 windowLow = zc->lowLimit; - U32* smallerPtr = bt + 2*(current&btMask); - U32* largerPtr = bt + 2*(current&btMask) + 1; - U32 matchEndIdx = current+8; - U32 dummy32; /* to be nullified at the end */ - U32 mnum = 0; - - const U32 minMatch = (mls == 3) ? 3 : 4; - size_t bestLength = minMatchLen-1; - - if (minMatch == 3) { /* HC3 match finder */ - U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3 (zc, ip); - if (matchIndex3>windowLow && (current - matchIndex3 < (1<<18))) { - const BYTE* match; - size_t currentMl=0; - if ((!extDict) || matchIndex3 >= dictLimit) { - match = base + matchIndex3; - if (match[bestLength] == ip[bestLength]) currentMl = ZSTD_count(ip, match, iLimit); - } else { - match = dictBase + matchIndex3; - if (MEM_readMINMATCH(match, MINMATCH) == MEM_readMINMATCH(ip, MINMATCH)) /* assumption : matchIndex3 <= dictLimit-4 (by table construction) */ - currentMl = ZSTD_count_2segments(ip+MINMATCH, match+MINMATCH, iLimit, dictEnd, prefixStart) + MINMATCH; - } - - /* save best solution */ - if (currentMl > bestLength) { - bestLength = currentMl; - matches[mnum].off = ZSTD_REP_MOVE_OPT + current - matchIndex3; - matches[mnum].len = (U32)currentMl; - mnum++; - if (currentMl > ZSTD_OPT_NUM) goto update; - if (ip+currentMl == iLimit) goto update; /* best possible, and avoid read overflow*/ - } - } - } - - hashTable[h] = current; /* Update Hash Table */ - - while (nbCompares-- && (matchIndex > windowLow)) { - U32* nextPtr = bt + 2*(matchIndex & btMask); - size_t matchLength = 
MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ - const BYTE* match; - - if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { - match = base + matchIndex; - if (match[matchLength] == ip[matchLength]) { - matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1; - } - } else { - match = dictBase + matchIndex; - matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart); - if (matchIndex+matchLength >= dictLimit) - match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ - } - - if (matchLength > bestLength) { - if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; - bestLength = matchLength; - matches[mnum].off = ZSTD_REP_MOVE_OPT + current - matchIndex; - matches[mnum].len = (U32)matchLength; - mnum++; - if (matchLength > ZSTD_OPT_NUM) break; - if (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */ - break; /* drop, to guarantee consistency (miss a little bit of compression) */ - } - - if (match[matchLength] < ip[matchLength]) { - /* match is smaller than current */ - *smallerPtr = matchIndex; /* update smaller idx */ - commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ - if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ - matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ - } else { - /* match is larger than current */ - *largerPtr = matchIndex; - commonLengthLarger = matchLength; - if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ - largerPtr = nextPtr; - matchIndex = nextPtr[0]; - } } - - *smallerPtr = *largerPtr = 0; - -update: - zc->nextToUpdate = (matchEndIdx > current + 8) ? 
matchEndIdx - 8 : current+1; - return mnum; -} - +#if defined (__cplusplus) +extern "C" { +#endif -/** Tree updater, providing best match */ -static U32 ZSTD_BtGetAllMatches ( - ZSTD_CCtx* zc, - const BYTE* const ip, const BYTE* const iLimit, - const U32 maxNbAttempts, const U32 mls, ZSTD_match_t* matches, const U32 minMatchLen) -{ - if (ip < zc->base + zc->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls); - return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 0, matches, minMatchLen); -} - - -static U32 ZSTD_BtGetAllMatches_selectMLS ( - ZSTD_CCtx* zc, /* Index table will be updated */ - const BYTE* ip, const BYTE* const iHighLimit, - const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, const U32 minMatchLen) -{ - switch(matchLengthSearch) - { - case 3 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen); - default : - case 4 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen); - case 5 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen); - case 6 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen); - } -} - -/** Tree updater, providing best match */ -static U32 ZSTD_BtGetAllMatches_extDict ( - ZSTD_CCtx* zc, - const BYTE* const ip, const BYTE* const iLimit, - const U32 maxNbAttempts, const U32 mls, ZSTD_match_t* matches, const U32 minMatchLen) -{ - if (ip < zc->base + zc->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls); - return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 1, matches, minMatchLen); -} - - -static U32 ZSTD_BtGetAllMatches_selectMLS_extDict ( - ZSTD_CCtx* zc, /* Index table will be updated */ - const BYTE* ip, const BYTE* const iHighLimit, - const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, const U32 
minMatchLen) -{ - switch(matchLengthSearch) - { - case 3 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen); - default : - case 4 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen); - case 5 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen); - case 6 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen); - } -} - +#include "zstd_compress_internal.h" -/*-******************************* -* Optimal parser -*********************************/ -FORCE_INLINE -void ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx, - const void* src, size_t srcSize, const int ultra) -{ - seqStore_t* seqStorePtr = &(ctx->seqStore); - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - const BYTE* const base = ctx->base; - const BYTE* const prefixStart = base + ctx->dictLimit; - - const U32 maxSearches = 1U << ctx->params.cParams.searchLog; - const U32 sufficient_len = ctx->params.cParams.targetLength; - const U32 mls = ctx->params.cParams.searchLength; - const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 
3 : 4; - - ZSTD_optimal_t* opt = seqStorePtr->priceTable; - ZSTD_match_t* matches = seqStorePtr->matchTable; - const BYTE* inr; - U32 offset, rep[ZSTD_REP_NUM]; - - /* init */ - ctx->nextToUpdate3 = ctx->nextToUpdate; - ZSTD_rescaleFreqs(seqStorePtr, (const BYTE*)src, srcSize); - ip += (ip==prefixStart); - { U32 i; for (i=0; irep[i]; } - - /* Match Loop */ - while (ip < ilimit) { - U32 cur, match_num, last_pos, litlen, price; - U32 u, mlen, best_mlen, best_off, litLength; - memset(opt, 0, sizeof(ZSTD_optimal_t)); - last_pos = 0; - litlen = (U32)(ip - anchor); - - /* check repCode */ - { U32 i, last_i = ZSTD_REP_CHECK + (ip==anchor); - for (i=(ip == anchor); i 0) && (repCur < (S32)(ip-prefixStart)) - && (MEM_readMINMATCH(ip, minMatch) == MEM_readMINMATCH(ip - repCur, minMatch))) { - mlen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repCur, iend) + minMatch; - if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) { - best_mlen = mlen; best_off = i; cur = 0; last_pos = 1; - goto _storeSequence; - } - best_off = i - (ip == anchor); - do { - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); - if (mlen > last_pos || price < opt[mlen].price) - SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */ - mlen--; - } while (mlen >= minMatch); - } } } - - match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, ip, iend, maxSearches, mls, matches, minMatch); - - if (!last_pos && !match_num) { ip++; continue; } +void ZSTD_updateTree( + ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + const BYTE* ip, const BYTE* iend); /* used in ZSTD_loadDictionaryContent() */ - if (match_num && (matches[match_num-1].len > sufficient_len || matches[match_num-1].len >= ZSTD_OPT_NUM)) { - best_mlen = matches[match_num-1].len; - best_off = matches[match_num-1].off; - cur = 0; - last_pos = 1; - goto _storeSequence; - } - - /* set prices using matches at position = 0 */ - best_mlen = (last_pos) ? 
last_pos : minMatch; - for (u = 0; u < match_num; u++) { - mlen = (u>0) ? matches[u-1].len+1 : best_mlen; - best_mlen = matches[u].len; - while (mlen <= best_mlen) { - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra); - if (mlen > last_pos || price < opt[mlen].price) - SET_PRICE(mlen, mlen, matches[u].off, litlen, price); /* note : macro modifies last_pos */ - mlen++; - } } - - if (last_pos < minMatch) { ip++; continue; } - - /* initialize opt[0] */ - { U32 i ; for (i=0; i litlen) { - price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-litlen); - } else - price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor); - } else { - litlen = 1; - price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1); - } - - if (cur > last_pos || price <= opt[cur].price) - SET_PRICE(cur, 1, 0, litlen, price); - - if (cur == last_pos) break; - - if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */ - continue; - - mlen = opt[cur].mlen; - if (opt[cur].off > ZSTD_REP_MOVE_OPT) { - opt[cur].rep[2] = opt[cur-mlen].rep[1]; - opt[cur].rep[1] = opt[cur-mlen].rep[0]; - opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT; - } else { - opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur-mlen].rep[1] : opt[cur-mlen].rep[2]; - opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur-mlen].rep[0] : opt[cur-mlen].rep[1]; - opt[cur].rep[0] = ((opt[cur].off==ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? 
(opt[cur-mlen].rep[0] - 1) : (opt[cur-mlen].rep[opt[cur].off]); - } - - best_mlen = minMatch; - { U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1); - for (i=(opt[cur].mlen != 1); i 0) && (repCur < (S32)(inr-prefixStart)) - && (MEM_readMINMATCH(inr, minMatch) == MEM_readMINMATCH(inr - repCur, minMatch))) { - mlen = (U32)ZSTD_count(inr+minMatch, inr+minMatch - repCur, iend) + minMatch; - - if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) { - best_mlen = mlen; best_off = i; last_pos = cur + 1; - goto _storeSequence; - } - - best_off = i - (opt[cur].mlen != 1); - if (mlen > best_mlen) best_mlen = mlen; - - do { - if (opt[cur].mlen == 1) { - litlen = opt[cur].litlen; - if (cur > litlen) { - price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr-litlen, best_off, mlen - MINMATCH, ultra); - } else - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); - } else { - litlen = 0; - price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra); - } - - if (cur + mlen > last_pos || price <= opt[cur + mlen].price) - SET_PRICE(cur + mlen, mlen, i, litlen, price); - mlen--; - } while (mlen >= minMatch); - } } } - - match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, inr, iend, maxSearches, mls, matches, best_mlen); - - if (match_num > 0 && (matches[match_num-1].len > sufficient_len || cur + matches[match_num-1].len >= ZSTD_OPT_NUM)) { - best_mlen = matches[match_num-1].len; - best_off = matches[match_num-1].off; - last_pos = cur + 1; - goto _storeSequence; - } - - /* set prices using matches at position = cur */ - for (u = 0; u < match_num; u++) { - mlen = (u>0) ? 
matches[u-1].len+1 : best_mlen; - best_mlen = matches[u].len; - - while (mlen <= best_mlen) { - if (opt[cur].mlen == 1) { - litlen = opt[cur].litlen; - if (cur > litlen) - price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip+cur-litlen, matches[u].off-1, mlen - MINMATCH, ultra); - else - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra); - } else { - litlen = 0; - price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off-1, mlen - MINMATCH, ultra); - } - - if (cur + mlen > last_pos || (price < opt[cur + mlen].price)) - SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price); - - mlen++; - } } } - - best_mlen = opt[last_pos].mlen; - best_off = opt[last_pos].off; - cur = last_pos - best_mlen; - - /* store sequence */ -_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ - opt[0].mlen = 1; - - while (1) { - mlen = opt[cur].mlen; - offset = opt[cur].off; - opt[cur].mlen = best_mlen; - opt[cur].off = best_off; - best_mlen = mlen; - best_off = offset; - if (mlen > cur) break; - cur -= mlen; - } - - for (u = 0; u <= last_pos;) { - u += opt[u].mlen; - } - - for (cur=0; cur < last_pos; ) { - mlen = opt[cur].mlen; - if (mlen == 1) { ip++; cur++; continue; } - offset = opt[cur].off; - cur += mlen; - litLength = (U32)(ip - anchor); - - if (offset > ZSTD_REP_MOVE_OPT) { - rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = offset - ZSTD_REP_MOVE_OPT; - offset--; - } else { - if (offset != 0) { - best_off = (offset==ZSTD_REP_MOVE_OPT) ? 
(rep[0] - 1) : (rep[offset]); - if (offset != 1) rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = best_off; - } - if (litLength==0) offset--; - } - - ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH); - ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH); - anchor = ip = ip + mlen; - } } /* for (cur=0; cur < last_pos; ) */ - - /* Save reps for next block */ - { int i; for (i=0; irepToConfirm[i] = rep[i]; } - - /* Last Literals */ - { size_t const lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } -} - - -FORCE_INLINE -void ZSTD_compressBlock_opt_extDict_generic(ZSTD_CCtx* ctx, - const void* src, size_t srcSize, const int ultra) -{ - seqStore_t* seqStorePtr = &(ctx->seqStore); - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* anchor = istart; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - const BYTE* const base = ctx->base; - const U32 lowestIndex = ctx->lowLimit; - const U32 dictLimit = ctx->dictLimit; - const BYTE* const prefixStart = base + dictLimit; - const BYTE* const dictBase = ctx->dictBase; - const BYTE* const dictEnd = dictBase + dictLimit; - - const U32 maxSearches = 1U << ctx->params.cParams.searchLog; - const U32 sufficient_len = ctx->params.cParams.targetLength; - const U32 mls = ctx->params.cParams.searchLength; - const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 
3 : 4; - - ZSTD_optimal_t* opt = seqStorePtr->priceTable; - ZSTD_match_t* matches = seqStorePtr->matchTable; - const BYTE* inr; - - /* init */ - U32 offset, rep[ZSTD_REP_NUM]; - { U32 i; for (i=0; irep[i]; } - - ctx->nextToUpdate3 = ctx->nextToUpdate; - ZSTD_rescaleFreqs(seqStorePtr, (const BYTE*)src, srcSize); - ip += (ip==prefixStart); - - /* Match Loop */ - while (ip < ilimit) { - U32 cur, match_num, last_pos, litlen, price; - U32 u, mlen, best_mlen, best_off, litLength; - U32 current = (U32)(ip-base); - memset(opt, 0, sizeof(ZSTD_optimal_t)); - last_pos = 0; - opt[0].litlen = (U32)(ip - anchor); +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); - /* check repCode */ - { U32 i, last_i = ZSTD_REP_CHECK + (ip==anchor); - for (i = (ip==anchor); i 0 && repCur <= (S32)current) - && (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex>lowestIndex)) /* intentional overflow */ - && (MEM_readMINMATCH(ip, minMatch) == MEM_readMINMATCH(repMatch, minMatch)) ) { - /* repcode detected we should take it */ - const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; - mlen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iend, repEnd, prefixStart) + minMatch; - - if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) { - best_mlen = mlen; best_off = i; cur = 0; last_pos = 1; - goto _storeSequence; - } - - best_off = i - (ip==anchor); - litlen = opt[0].litlen; - do { - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); - if (mlen > last_pos || price < opt[mlen].price) - SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */ - mlen--; - } while (mlen >= minMatch); - } } } - - match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, ip, iend, maxSearches, mls, matches, minMatch); /* first search (depth 0) */ - - if (!last_pos && !match_num) { ip++; continue; } - - { U32 i; for (i=0; i sufficient_len || matches[match_num-1].len >= ZSTD_OPT_NUM)) { - best_mlen = matches[match_num-1].len; - best_off = matches[match_num-1].off; - cur = 0; - last_pos = 1; - goto _storeSequence; - } - - best_mlen = (last_pos) ? last_pos : minMatch; - - /* set prices using matches at position = 0 */ - for (u = 0; u < match_num; u++) { - mlen = (u>0) ? 
matches[u-1].len+1 : best_mlen; - best_mlen = matches[u].len; - litlen = opt[0].litlen; - while (mlen <= best_mlen) { - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra); - if (mlen > last_pos || price < opt[mlen].price) - SET_PRICE(mlen, mlen, matches[u].off, litlen, price); - mlen++; - } } - - if (last_pos < minMatch) { - ip++; continue; - } - - /* check further positions */ - for (cur = 1; cur <= last_pos; cur++) { - inr = ip + cur; - - if (opt[cur-1].mlen == 1) { - litlen = opt[cur-1].litlen + 1; - if (cur > litlen) { - price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-litlen); - } else - price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor); - } else { - litlen = 1; - price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1); - } - - if (cur > last_pos || price <= opt[cur].price) - SET_PRICE(cur, 1, 0, litlen, price); - - if (cur == last_pos) break; - - if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */ - continue; - - mlen = opt[cur].mlen; - if (opt[cur].off > ZSTD_REP_MOVE_OPT) { - opt[cur].rep[2] = opt[cur-mlen].rep[1]; - opt[cur].rep[1] = opt[cur-mlen].rep[0]; - opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT; - } else { - opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur-mlen].rep[1] : opt[cur-mlen].rep[2]; - opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur-mlen].rep[0] : opt[cur-mlen].rep[1]; - opt[cur].rep[0] = ((opt[cur].off==ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur-mlen].rep[0] - 1) : (opt[cur-mlen].rep[opt[cur].off]); - } - - best_mlen = minMatch; - { U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1); - for (i = (mlen != 1); i 0 && repCur <= (S32)(current+cur)) - && (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex>lowestIndex)) /* intentional overflow */ - && (MEM_readMINMATCH(inr, minMatch) == MEM_readMINMATCH(repMatch, minMatch)) ) { - /* repcode detected */ - const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; - mlen = (U32)ZSTD_count_2segments(inr+minMatch, repMatch+minMatch, iend, repEnd, prefixStart) + minMatch; - - if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) { - best_mlen = mlen; best_off = i; last_pos = cur + 1; - goto _storeSequence; - } +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); - best_off = i - (opt[cur].mlen != 1); - if (mlen > best_mlen) best_mlen = mlen; - - do { - if (opt[cur].mlen == 1) { - litlen = opt[cur].litlen; - if (cur > litlen) { - price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr-litlen, best_off, mlen - MINMATCH, ultra); - } else - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); - } else { - litlen = 0; - price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra); - } - - if (cur + mlen > last_pos || price <= opt[cur + mlen].price) - SET_PRICE(cur + mlen, mlen, i, litlen, price); - mlen--; - } while (mlen >= minMatch); - } } } - - match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, inr, iend, maxSearches, mls, matches, minMatch); - - if (match_num > 0 && (matches[match_num-1].len > sufficient_len || cur + matches[match_num-1].len >= ZSTD_OPT_NUM)) { - best_mlen = matches[match_num-1].len; - best_off = matches[match_num-1].off; - last_pos = cur + 1; - goto _storeSequence; - } - - /* set prices using matches at position = cur */ - for (u = 0; u < match_num; u++) { - mlen = (u>0) ? 
matches[u-1].len+1 : best_mlen; - best_mlen = matches[u].len; - - while (mlen <= best_mlen) { - if (opt[cur].mlen == 1) { - litlen = opt[cur].litlen; - if (cur > litlen) - price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip+cur-litlen, matches[u].off-1, mlen - MINMATCH, ultra); - else - price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra); - } else { - litlen = 0; - price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off-1, mlen - MINMATCH, ultra); - } - - if (cur + mlen > last_pos || (price < opt[cur + mlen].price)) - SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price); - - mlen++; - } } } /* for (cur = 1; cur <= last_pos; cur++) */ - - best_mlen = opt[last_pos].mlen; - best_off = opt[last_pos].off; - cur = last_pos - best_mlen; +#if defined (__cplusplus) +} +#endif - /* store sequence */ -_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ - opt[0].mlen = 1; - - while (1) { - mlen = opt[cur].mlen; - offset = opt[cur].off; - opt[cur].mlen = best_mlen; - opt[cur].off = best_off; - best_mlen = mlen; - best_off = offset; - if (mlen > cur) break; - cur -= mlen; - } - - for (u = 0; u <= last_pos; ) { - u += opt[u].mlen; - } - - for (cur=0; cur < last_pos; ) { - mlen = opt[cur].mlen; - if (mlen == 1) { ip++; cur++; continue; } - offset = opt[cur].off; - cur += mlen; - litLength = (U32)(ip - anchor); - - if (offset > ZSTD_REP_MOVE_OPT) { - rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = offset - ZSTD_REP_MOVE_OPT; - offset--; - } else { - if (offset != 0) { - best_off = (offset==ZSTD_REP_MOVE_OPT) ? 
(rep[0] - 1) : (rep[offset]); - if (offset != 1) rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = best_off; - } - - if (litLength==0) offset--; - } - - ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH); - ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH); - anchor = ip = ip + mlen; - } } /* for (cur=0; cur < last_pos; ) */ - - /* Save reps for next block */ - { int i; for (i=0; irepToConfirm[i] = rep[i]; } - - /* Last Literals */ - { size_t lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; - } -} - -#endif /* ZSTD_OPT_H_91842398743 */ +#endif /* ZSTD_OPT_H */ diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstdmt_compress.c --- a/contrib/python-zstandard/zstd/compress/zstdmt_compress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/compress/zstdmt_compress.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,264 +1,712 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ /* ====== Tuning parameters ====== */ -#define ZSTDMT_NBTHREADS_MAX 128 +#define ZSTDMT_NBWORKERS_MAX 200 +#define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? 
(512 MB) : (2 GB)) /* note : limited by `jobSize` type, which is `unsigned` */ +#define ZSTDMT_OVERLAPLOG_DEFAULT 6 /* ====== Compiler specifics ====== */ #if defined(_MSC_VER) -# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ #endif /* ====== Dependencies ====== */ -#include /* malloc */ -#include /* memcpy */ -#include "pool.h" /* threadpool */ -#include "threading.h" /* mutex */ -#include "zstd_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include /* memcpy, memset */ +#include /* INT_MAX */ +#include "pool.h" /* threadpool */ +#include "threading.h" /* mutex */ +#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include "zstd_ldm.h" #include "zstdmt_compress.h" -#define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ -#include "xxhash.h" +/* Guards code to support resizing the SeqPool. + * We will want to resize the SeqPool to save memory in the future. + * Until then, comment the code out since it is unused. + */ +#define ZSTD_RESIZE_SEQPOOL 0 /* ====== Debug ====== */ -#if 0 +#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2) # include # include # include - static unsigned g_debugLevel = 3; -# define DEBUGLOGRAW(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __VA_ARGS__); } -# define DEBUGLOG(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __FILE__ ": "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, " \n"); } +# define DEBUGLOGRAW(l, ...) 
if (l<=ZSTD_DEBUG) { fprintf(stderr, __VA_ARGS__); } -# define DEBUG_PRINTHEX(l,p,n) { \ - unsigned debug_u; \ - for (debug_u=0; debug_u<(n); debug_u++) \ +# define DEBUG_PRINTHEX(l,p,n) { \ + unsigned debug_u; \ + for (debug_u=0; debug_u<(n); debug_u++) \ DEBUGLOGRAW(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ - DEBUGLOGRAW(l, " \n"); \ + DEBUGLOGRAW(l, " \n"); \ } -static unsigned long long GetCurrentClockTimeMicroseconds() +static unsigned long long GetCurrentClockTimeMicroseconds(void) { static clock_t _ticksPerSecond = 0; if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK); - struct tms junk; clock_t newTicks = (clock_t) times(&junk); - return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond); + { struct tms junk; clock_t newTicks = (clock_t) times(&junk); + return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond); } } -#define MUTEX_WAIT_TIME_DLEVEL 5 -#define PTHREAD_MUTEX_LOCK(mutex) \ -if (g_debugLevel>=MUTEX_WAIT_TIME_DLEVEL) { \ - unsigned long long beforeTime = GetCurrentClockTimeMicroseconds(); \ - pthread_mutex_lock(mutex); \ - unsigned long long afterTime = GetCurrentClockTimeMicroseconds(); \ - unsigned long long elapsedTime = (afterTime-beforeTime); \ - if (elapsedTime > 1000) { /* or whatever threshold you like; I'm using 1 millisecond here */ \ - DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \ - elapsedTime, #mutex); \ - } \ -} else pthread_mutex_lock(mutex); +#define MUTEX_WAIT_TIME_DLEVEL 6 +#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) { \ + if (ZSTD_DEBUG >= MUTEX_WAIT_TIME_DLEVEL) { \ + unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ + ZSTD_pthread_mutex_lock(mutex); \ + { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ + unsigned long long const elapsedTime = (afterTime-beforeTime); \ + if (elapsedTime > 1000) { /* or whatever threshold you like; I'm using 1 millisecond here */ \ + 
DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \ + elapsedTime, #mutex); \ + } } \ + } else { \ + ZSTD_pthread_mutex_lock(mutex); \ + } \ +} #else -# define DEBUGLOG(l, ...) {} /* disabled */ -# define PTHREAD_MUTEX_LOCK(m) pthread_mutex_lock(m) +# define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m) # define DEBUG_PRINTHEX(l,p,n) {} #endif /* ===== Buffer Pool ===== */ +/* a single Buffer Pool can be invoked from multiple threads in parallel */ typedef struct buffer_s { void* start; - size_t size; + size_t capacity; } buffer_t; static const buffer_t g_nullBuffer = { NULL, 0 }; typedef struct ZSTDMT_bufferPool_s { + ZSTD_pthread_mutex_t poolMutex; + size_t bufferSize; unsigned totalBuffers; unsigned nbBuffers; + ZSTD_customMem cMem; buffer_t bTable[1]; /* variable size */ } ZSTDMT_bufferPool; -static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbThreads) +static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_customMem cMem) { - unsigned const maxNbBuffers = 2*nbThreads + 2; - ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)calloc(1, sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t)); + unsigned const maxNbBuffers = 2*nbWorkers + 3; + ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_calloc( + sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem); if (bufPool==NULL) return NULL; + if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) { + ZSTD_free(bufPool, cMem); + return NULL; + } + bufPool->bufferSize = 64 KB; bufPool->totalBuffers = maxNbBuffers; bufPool->nbBuffers = 0; + bufPool->cMem = cMem; return bufPool; } static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) { unsigned u; + DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool); if (!bufPool) return; /* compatibility with free on NULL */ + for (u=0; utotalBuffers; u++) { + DEBUGLOG(4, "free buffer %2u (address:%08X)", u, 
(U32)(size_t)bufPool->bTable[u].start); + ZSTD_free(bufPool->bTable[u].start, bufPool->cMem); + } + ZSTD_pthread_mutex_destroy(&bufPool->poolMutex); + ZSTD_free(bufPool, bufPool->cMem); +} + +/* only works at initialization, not during compression */ +static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool) +{ + size_t const poolSize = sizeof(*bufPool) + + (bufPool->totalBuffers - 1) * sizeof(buffer_t); + unsigned u; + size_t totalBufferSize = 0; + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); for (u=0; utotalBuffers; u++) - free(bufPool->bTable[u].start); - free(bufPool); + totalBufferSize += bufPool->bTable[u].capacity; + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + + return poolSize + totalBufferSize; +} + +/* ZSTDMT_setBufferSize() : + * all future buffers provided by this buffer pool will have _at least_ this size + * note : it's better for all buffers to have same size, + * as they become freely interchangeable, reducing malloc/free usages and memory fragmentation */ +static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize) +{ + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize); + bufPool->bufferSize = bSize; + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); } -/* assumption : invocation from main thread only ! 
*/ -static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* pool, size_t bSize) +/** ZSTDMT_getBuffer() : + * assumption : bufPool must be valid + * @return : a buffer, with start pointer and size + * note: allocation may fail, in this case, start==NULL and size==0 */ +static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) { - if (pool->nbBuffers) { /* try to use an existing buffer */ - buffer_t const buf = pool->bTable[--(pool->nbBuffers)]; - size_t const availBufferSize = buf.size; - if ((availBufferSize >= bSize) & (availBufferSize <= 10*bSize)) /* large enough, but not too much */ + size_t const bSize = bufPool->bufferSize; + DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize); + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + if (bufPool->nbBuffers) { /* try to use an existing buffer */ + buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)]; + size_t const availBufferSize = buf.capacity; + bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer; + if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) { + /* large enough, but not too much */ + DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u", + bufPool->nbBuffers, (U32)buf.capacity); + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); return buf; - free(buf.start); /* size conditions not respected : scratch this buffer and create a new one */ + } + /* size conditions not respected : scratch this buffer, create new one */ + DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing"); + ZSTD_free(buf.start, bufPool->cMem); } + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); /* create new buffer */ + DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer"); { buffer_t buffer; - void* const start = malloc(bSize); - if (start==NULL) bSize = 0; + void* const start = ZSTD_malloc(bSize, bufPool->cMem); buffer.start = start; /* note : start can be NULL if malloc fails ! */ - buffer.size = bSize; + buffer.capacity = (start==NULL) ? 
0 : bSize; + if (start==NULL) { + DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!"); + } else { + DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize); + } return buffer; } } -/* store buffer for later re-use, up to pool capacity */ -static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* pool, buffer_t buf) +#if ZSTD_RESIZE_SEQPOOL +/** ZSTDMT_resizeBuffer() : + * assumption : bufPool must be valid + * @return : a buffer that is at least the buffer pool buffer size. + * If a reallocation happens, the data in the input buffer is copied. + */ +static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer) { - if (buf.start == NULL) return; /* release on NULL */ - if (pool->nbBuffers < pool->totalBuffers) { - pool->bTable[pool->nbBuffers++] = buf; /* store for later re-use */ + size_t const bSize = bufPool->bufferSize; + if (buffer.capacity < bSize) { + void* const start = ZSTD_malloc(bSize, bufPool->cMem); + buffer_t newBuffer; + newBuffer.start = start; + newBuffer.capacity = start == NULL ? 
0 : bSize; + if (start != NULL) { + assert(newBuffer.capacity >= buffer.capacity); + memcpy(newBuffer.start, buffer.start, buffer.capacity); + DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize); + return newBuffer; + } + DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!"); + } + return buffer; +} +#endif + +/* store buffer for later re-use, up to pool capacity */ +static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf) +{ + if (buf.start == NULL) return; /* compatible with release on NULL */ + DEBUGLOG(5, "ZSTDMT_releaseBuffer"); + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + if (bufPool->nbBuffers < bufPool->totalBuffers) { + bufPool->bTable[bufPool->nbBuffers++] = buf; /* stored for later use */ + DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u", + (U32)buf.capacity, (U32)(bufPool->nbBuffers-1)); + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); return; } + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); /* Reached bufferPool capacity (should not happen) */ - free(buf.start); + DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing "); + ZSTD_free(buf.start, bufPool->cMem); } +/* ===== Seq Pool Wrapper ====== */ + +static rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0}; + +typedef ZSTDMT_bufferPool ZSTDMT_seqPool; + +static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool) +{ + return ZSTDMT_sizeof_bufferPool(seqPool); +} + +static rawSeqStore_t bufferToSeq(buffer_t buffer) +{ + rawSeqStore_t seq = {NULL, 0, 0, 0}; + seq.seq = (rawSeq*)buffer.start; + seq.capacity = buffer.capacity / sizeof(rawSeq); + return seq; +} + +static buffer_t seqToBuffer(rawSeqStore_t seq) +{ + buffer_t buffer; + buffer.start = seq.seq; + buffer.capacity = seq.capacity * sizeof(rawSeq); + return buffer; +} + +static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) +{ + if (seqPool->bufferSize == 0) { + return kNullRawSeqStore; + } + return bufferToSeq(ZSTDMT_getBuffer(seqPool)); +} + 
+#if ZSTD_RESIZE_SEQPOOL +static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) +{ + return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq))); +} +#endif + +static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) +{ + ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq)); +} + +static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq) +{ + ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq)); +} + +static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem) +{ + ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + ZSTDMT_setNbSeq(seqPool, 0); + return seqPool; +} + +static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool) +{ + ZSTDMT_freeBufferPool(seqPool); +} + + + /* ===== CCtx Pool ===== */ +/* a single CCtx Pool can be invoked from multiple threads in parallel */ typedef struct { + ZSTD_pthread_mutex_t poolMutex; unsigned totalCCtx; unsigned availCCtx; + ZSTD_customMem cMem; ZSTD_CCtx* cctx[1]; /* variable size */ } ZSTDMT_CCtxPool; -/* assumption : CCtxPool invocation only from main thread */ - /* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) { unsigned u; for (u=0; utotalCCtx; u++) ZSTD_freeCCtx(pool->cctx[u]); /* note : compatible with free on NULL */ - free(pool); + ZSTD_pthread_mutex_destroy(&pool->poolMutex); + ZSTD_free(pool, pool->cMem); } /* ZSTDMT_createCCtxPool() : - * implies nbThreads >= 1 , checked by caller ZSTDMT_createCCtx() */ -static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbThreads) + * implies nbWorkers >= 1 , checked by caller ZSTDMT_createCCtx() */ +static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbWorkers, + ZSTD_customMem cMem) { - ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) calloc(1, sizeof(ZSTDMT_CCtxPool) + (nbThreads-1)*sizeof(ZSTD_CCtx*)); + ZSTDMT_CCtxPool* const cctxPool = 
(ZSTDMT_CCtxPool*) ZSTD_calloc( + sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*), cMem); + assert(nbWorkers > 0); if (!cctxPool) return NULL; - cctxPool->totalCCtx = nbThreads; + if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) { + ZSTD_free(cctxPool, cMem); + return NULL; + } + cctxPool->cMem = cMem; + cctxPool->totalCCtx = nbWorkers; cctxPool->availCCtx = 1; /* at least one cctx for single-thread mode */ - cctxPool->cctx[0] = ZSTD_createCCtx(); + cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem); if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } - DEBUGLOG(1, "cctxPool created, with %u threads", nbThreads); + DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers); return cctxPool; } -static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* pool) +/* only works during initialization phase, not during compression */ +static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool) { - if (pool->availCCtx) { - pool->availCCtx--; - return pool->cctx[pool->availCCtx]; + ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); + { unsigned const nbWorkers = cctxPool->totalCCtx; + size_t const poolSize = sizeof(*cctxPool) + + (nbWorkers-1) * sizeof(ZSTD_CCtx*); + unsigned u; + size_t totalCCtxSize = 0; + for (u=0; ucctx[u]); + } + ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); + assert(nbWorkers > 0); + return poolSize + totalCCtxSize; } - return ZSTD_createCCtx(); /* note : can be NULL, when creation fails ! */ +} + +static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool) +{ + DEBUGLOG(5, "ZSTDMT_getCCtx"); + ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); + if (cctxPool->availCCtx) { + cctxPool->availCCtx--; + { ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx]; + ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); + return cctx; + } } + ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); + DEBUGLOG(5, "create one more CCtx"); + return ZSTD_createCCtx_advanced(cctxPool->cMem); /* note : can be NULL, when creation fails ! 
*/ } static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) { if (cctx==NULL) return; /* compatibility with release on NULL */ + ZSTD_pthread_mutex_lock(&pool->poolMutex); if (pool->availCCtx < pool->totalCCtx) pool->cctx[pool->availCCtx++] = cctx; - else - /* pool overflow : should not happen, since totalCCtx==nbThreads */ + else { + /* pool overflow : should not happen, since totalCCtx==nbWorkers */ + DEBUGLOG(4, "CCtx pool overflow : free cctx"); ZSTD_freeCCtx(cctx); + } + ZSTD_pthread_mutex_unlock(&pool->poolMutex); +} + +/* ==== Serial State ==== */ + +typedef struct { + void const* start; + size_t size; +} range_t; + +typedef struct { + /* All variables in the struct are protected by mutex. */ + ZSTD_pthread_mutex_t mutex; + ZSTD_pthread_cond_t cond; + ZSTD_CCtx_params params; + ldmState_t ldmState; + XXH64_state_t xxhState; + unsigned nextJobID; + /* Protects ldmWindow. + * Must be acquired after the main mutex when acquiring both. + */ + ZSTD_pthread_mutex_t ldmWindowMutex; + ZSTD_pthread_cond_t ldmWindowCond; /* Signaled when ldmWindow is udpated */ + ZSTD_window_t ldmWindow; /* A thread-safe copy of ldmState.window */ +} serialState_t; + +static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params) +{ + /* Adjust parameters */ + if (params.ldmParams.enableLdm) { + DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10); + params.ldmParams.windowLog = params.cParams.windowLog; + ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); + assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); + assert(params.ldmParams.hashEveryLog < 32); + serialState->ldmState.hashPower = + ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength); + } else { + memset(¶ms.ldmParams, 0, sizeof(params.ldmParams)); + } + serialState->nextJobID = 0; + if (params.fParams.checksumFlag) + XXH64_reset(&serialState->xxhState, 0); + if (params.ldmParams.enableLdm) { + ZSTD_customMem 
cMem = params.customMem; + unsigned const hashLog = params.ldmParams.hashLog; + size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t); + unsigned const bucketLog = + params.ldmParams.hashLog - params.ldmParams.bucketSizeLog; + size_t const bucketSize = (size_t)1 << bucketLog; + unsigned const prevBucketLog = + serialState->params.ldmParams.hashLog - + serialState->params.ldmParams.bucketSizeLog; + /* Size the seq pool tables */ + ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, params.jobSize)); + /* Reset the window */ + ZSTD_window_clear(&serialState->ldmState.window); + serialState->ldmWindow = serialState->ldmState.window; + /* Resize tables and output space if necessary. */ + if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) { + ZSTD_free(serialState->ldmState.hashTable, cMem); + serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_malloc(hashSize, cMem); + } + if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) { + ZSTD_free(serialState->ldmState.bucketOffsets, cMem); + serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_malloc(bucketSize, cMem); + } + if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets) + return 1; + /* Zero the tables */ + memset(serialState->ldmState.hashTable, 0, hashSize); + memset(serialState->ldmState.bucketOffsets, 0, bucketSize); + } + serialState->params = params; + return 0; +} + +static int ZSTDMT_serialState_init(serialState_t* serialState) +{ + int initError = 0; + memset(serialState, 0, sizeof(*serialState)); + initError |= ZSTD_pthread_mutex_init(&serialState->mutex, NULL); + initError |= ZSTD_pthread_cond_init(&serialState->cond, NULL); + initError |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL); + initError |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL); + return initError; +} + +static void ZSTDMT_serialState_free(serialState_t* serialState) +{ + ZSTD_customMem cMem = 
serialState->params.customMem; + ZSTD_pthread_mutex_destroy(&serialState->mutex); + ZSTD_pthread_cond_destroy(&serialState->cond); + ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex); + ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond); + ZSTD_free(serialState->ldmState.hashTable, cMem); + ZSTD_free(serialState->ldmState.bucketOffsets, cMem); +} + +static void ZSTDMT_serialState_update(serialState_t* serialState, + ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore, + range_t src, unsigned jobID) +{ + /* Wait for our turn */ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); + while (serialState->nextJobID < jobID) { + ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex); + } + /* A future job may error and skip our job */ + if (serialState->nextJobID == jobID) { + /* It is now our turn, do any processing necessary */ + if (serialState->params.ldmParams.enableLdm) { + size_t error; + assert(seqStore.seq != NULL && seqStore.pos == 0 && + seqStore.size == 0 && seqStore.capacity > 0); + ZSTD_window_update(&serialState->ldmState.window, src.start, src.size); + error = ZSTD_ldm_generateSequences( + &serialState->ldmState, &seqStore, + &serialState->params.ldmParams, src.start, src.size); + /* We provide a large enough buffer to never fail. */ + assert(!ZSTD_isError(error)); (void)error; + /* Update ldmWindow to match the ldmState.window and signal the main + * thread if it is waiting for a buffer. 
+ */ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex); + serialState->ldmWindow = serialState->ldmState.window; + ZSTD_pthread_cond_signal(&serialState->ldmWindowCond); + ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex); + } + if (serialState->params.fParams.checksumFlag && src.size > 0) + XXH64_update(&serialState->xxhState, src.start, src.size); + } + /* Now it is the next jobs turn */ + serialState->nextJobID++; + ZSTD_pthread_cond_broadcast(&serialState->cond); + ZSTD_pthread_mutex_unlock(&serialState->mutex); + + if (seqStore.size > 0) { + size_t const err = ZSTD_referenceExternalSequences( + jobCCtx, seqStore.seq, seqStore.size); + assert(serialState->params.ldmParams.enableLdm); + assert(!ZSTD_isError(err)); + (void)err; + } +} + +static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState, + unsigned jobID, size_t cSize) +{ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); + if (serialState->nextJobID <= jobID) { + assert(ZSTD_isError(cSize)); (void)cSize; + DEBUGLOG(5, "Skipping past job %u because of error", jobID); + serialState->nextJobID = jobID + 1; + ZSTD_pthread_cond_broadcast(&serialState->cond); + + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex); + ZSTD_window_clear(&serialState->ldmWindow); + ZSTD_pthread_cond_signal(&serialState->ldmWindowCond); + ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex); + } + ZSTD_pthread_mutex_unlock(&serialState->mutex); + } -/* ===== Thread worker ===== */ +/* ------------------------------------------ */ +/* ===== Worker thread ===== */ +/* ------------------------------------------ */ -typedef struct { - buffer_t buffer; - size_t filled; -} inBuff_t; +static const range_t kNullRange = { NULL, 0 }; typedef struct { - ZSTD_CCtx* cctx; - buffer_t src; - const void* srcStart; - size_t srcSize; - size_t dictSize; - buffer_t dstBuff; - size_t cSize; - size_t dstFlushed; - unsigned firstChunk; - unsigned lastChunk; - unsigned jobCompleted; - unsigned jobScanned; - pthread_mutex_t* 
jobCompleted_mutex; - pthread_cond_t* jobCompleted_cond; - ZSTD_parameters params; - ZSTD_CDict* cdict; - unsigned long long fullFrameSize; + size_t consumed; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */ + size_t cSize; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */ + ZSTD_pthread_mutex_t job_mutex; /* Thread-safe - used by mtctx and worker */ + ZSTD_pthread_cond_t job_cond; /* Thread-safe - used by mtctx and worker */ + ZSTDMT_CCtxPool* cctxPool; /* Thread-safe - used by mtctx and (all) workers */ + ZSTDMT_bufferPool* bufPool; /* Thread-safe - used by mtctx and (all) workers */ + ZSTDMT_seqPool* seqPool; /* Thread-safe - used by mtctx and (all) workers */ + serialState_t* serial; /* Thread-safe - used by mtctx and (all) workers */ + buffer_t dstBuff; /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */ + range_t prefix; /* set by mtctx, then read by worker & mtctx => no barrier */ + range_t src; /* set by mtctx, then read by worker & mtctx => no barrier */ + unsigned jobID; /* set by mtctx, then read by worker => no barrier */ + unsigned firstJob; /* set by mtctx, then read by worker => no barrier */ + unsigned lastJob; /* set by mtctx, then read by worker => no barrier */ + ZSTD_CCtx_params params; /* set by mtctx, then read by worker => no barrier */ + const ZSTD_CDict* cdict; /* set by mtctx, then read by worker => no barrier */ + unsigned long long fullFrameSize; /* set by mtctx, then read by worker => no barrier */ + size_t dstFlushed; /* used only by mtctx */ + unsigned frameChecksumNeeded; /* used only by mtctx */ } ZSTDMT_jobDescription; -/* ZSTDMT_compressChunk() : POOL_function type */ -void ZSTDMT_compressChunk(void* jobDescription) +/* ZSTDMT_compressionJob() is a POOL_function type */ +void ZSTDMT_compressionJob(void* jobDescription) { ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription; - const void* const src = 
(const char*)job->srcStart + job->dictSize; - buffer_t const dstBuff = job->dstBuff; - DEBUGLOG(3, "job (first:%u) (last:%u) : dictSize %u, srcSize %u", job->firstChunk, job->lastChunk, (U32)job->dictSize, (U32)job->srcSize); - if (job->cdict) { - size_t const initError = ZSTD_compressBegin_usingCDict(job->cctx, job->cdict, job->fullFrameSize); - if (job->cdict) DEBUGLOG(3, "using CDict "); - if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; } - } else { - size_t const initError = ZSTD_compressBegin_advanced(job->cctx, job->srcStart, job->dictSize, job->params, job->fullFrameSize); - if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; } - ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceWindow, 1); + ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! copy it, modify the copy */ + ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool); + rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool); + buffer_t dstBuff = job->dstBuff; + + /* Don't compute the checksum for chunks, since we compute it externally, + * but write it in the header. 
+ */ + if (job->jobID != 0) jobParams.fParams.checksumFlag = 0; + /* Don't run LDM for the chunks, since we handle it externally */ + jobParams.ldmParams.enableLdm = 0; + + /* ressources */ + if (cctx==NULL) { + job->cSize = ERROR(memory_allocation); + goto _endJob; } - if (!job->firstChunk) { /* flush frame header */ - size_t const hSize = ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, 0); - if (ZSTD_isError(hSize)) { job->cSize = hSize; goto _endJob; } - ZSTD_invalidateRepCodes(job->cctx); + if (dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */ + dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (dstBuff.start==NULL) { + job->cSize = ERROR(memory_allocation); + goto _endJob; + } + job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */ } - DEBUGLOG(4, "Compressing : "); - DEBUG_PRINTHEX(4, job->srcStart, 12); - job->cSize = (job->lastChunk) ? /* last chunk signal */ - ZSTD_compressEnd (job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize) : - ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize); - DEBUGLOG(3, "compressed %u bytes into %u bytes (first:%u) (last:%u)", (unsigned)job->srcSize, (unsigned)job->cSize, job->firstChunk, job->lastChunk); + /* init */ + if (job->cdict) { + size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, job->cdict, jobParams, job->fullFrameSize); + assert(job->firstJob); /* only allowed for first job */ + if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; } + } else { /* srcStart points at reloaded section */ + U64 const pledgedSrcSize = job->firstJob ? 
job->fullFrameSize : job->src.size; + { size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob); + if (ZSTD_isError(forceWindowError)) { + job->cSize = forceWindowError; + goto _endJob; + } } + { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, + job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */ + NULL, /*cdict*/ + jobParams, pledgedSrcSize); + if (ZSTD_isError(initError)) { + job->cSize = initError; + goto _endJob; + } } } + + /* Perform serial step as early as possible, but after CCtx initialization */ + ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID); + + if (!job->firstJob) { /* flush and overwrite frame header when it's not first job */ + size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0); + if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; } + DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize); + ZSTD_invalidateRepCodes(cctx); + } + + /* compress */ + { size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX; + int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize); + const BYTE* ip = (const BYTE*) job->src.start; + BYTE* const ostart = (BYTE*)dstBuff.start; + BYTE* op = ostart; + BYTE* oend = op + dstBuff.capacity; + int chunkNb; + if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize); /* check overflow */ + DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks); + assert(job->cSize == 0); + for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) { + size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize); + if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; } + ip += chunkSize; + op += cSize; assert(op < oend); + /* stats */ + 
ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); + job->cSize += cSize; + job->consumed = chunkSize * chunkNb; + DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)", + (U32)cSize, (U32)job->cSize); + ZSTD_pthread_cond_signal(&job->job_cond); /* warns some more data is ready to be flushed */ + ZSTD_pthread_mutex_unlock(&job->job_mutex); + } + /* last block */ + assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0); /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */ + if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) { + size_t const lastBlockSize1 = job->src.size & (chunkSize-1); + size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1; + size_t const cSize = (job->lastJob) ? + ZSTD_compressEnd (cctx, op, oend-op, ip, lastBlockSize) : + ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize); + if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; } + /* stats */ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); + job->cSize += cSize; + ZSTD_pthread_mutex_unlock(&job->job_mutex); + } } _endJob: - PTHREAD_MUTEX_LOCK(job->jobCompleted_mutex); - job->jobCompleted = 1; - job->jobScanned = 0; - pthread_cond_signal(job->jobCompleted_cond); - pthread_mutex_unlock(job->jobCompleted_mutex); + ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize); + if (job->prefix.size > 0) + DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start); + DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start); + /* release resources */ + ZSTDMT_releaseSeq(job->seqPool, rawSeqStore); + ZSTDMT_releaseCCtx(job->cctxPool, cctx); + /* report */ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); + job->consumed = job->src.size; + ZSTD_pthread_cond_signal(&job->job_cond); + ZSTD_pthread_mutex_unlock(&job->job_mutex); } @@ -266,213 +714,500 @@ /* ===== Multi-threaded compression ===== */ /* ------------------------------------------ */ 
+typedef struct { + range_t prefix; /* read-only non-owned prefix buffer */ + buffer_t buffer; + size_t filled; +} inBuff_t; + +typedef struct { + BYTE* buffer; /* The round input buffer. All jobs get references + * to pieces of the buffer. ZSTDMT_tryGetInputRange() + * handles handing out job input buffers, and makes + * sure it doesn't overlap with any pieces still in use. + */ + size_t capacity; /* The capacity of buffer. */ + size_t pos; /* The position of the current inBuff in the round + * buffer. Updated past the end if the inBuff once + * the inBuff is sent to the worker thread. + * pos <= capacity. + */ +} roundBuff_t; + +static const roundBuff_t kNullRoundBuff = {NULL, 0, 0}; + struct ZSTDMT_CCtx_s { POOL_ctx* factory; - ZSTDMT_bufferPool* buffPool; + ZSTDMT_jobDescription* jobs; + ZSTDMT_bufferPool* bufPool; ZSTDMT_CCtxPool* cctxPool; - pthread_mutex_t jobCompleted_mutex; - pthread_cond_t jobCompleted_cond; + ZSTDMT_seqPool* seqPool; + ZSTD_CCtx_params params; size_t targetSectionSize; - size_t marginSize; - size_t inBuffSize; - size_t dictSize; - size_t targetDictSize; + size_t targetPrefixSize; + roundBuff_t roundBuff; inBuff_t inBuff; - ZSTD_parameters params; - XXH64_state_t xxhState; - unsigned nbThreads; + int jobReady; /* 1 => one job is already prepared, but pool has shortage of workers. Don't create another one. 
*/ + serialState_t serial; + unsigned singleBlockingThread; unsigned jobIDMask; unsigned doneJobID; unsigned nextJobID; unsigned frameEnded; unsigned allJobsCompleted; - unsigned overlapRLog; unsigned long long frameContentSize; - size_t sectionSize; - ZSTD_CDict* cdict; - ZSTD_CStream* cstream; - ZSTDMT_jobDescription jobs[1]; /* variable size (must lies at the end) */ + unsigned long long consumed; + unsigned long long produced; + ZSTD_customMem cMem; + ZSTD_CDict* cdictLocal; + const ZSTD_CDict* cdict; }; -ZSTDMT_CCtx *ZSTDMT_createCCtx(unsigned nbThreads) +static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem) { - ZSTDMT_CCtx* cctx; - U32 const minNbJobs = nbThreads + 2; - U32 const nbJobsLog2 = ZSTD_highbit32(minNbJobs) + 1; + U32 jobNb; + if (jobTable == NULL) return; + for (jobNb=0; jobNb ZSTDMT_NBTHREADS_MAX)) return NULL; - cctx = (ZSTDMT_CCtx*) calloc(1, sizeof(ZSTDMT_CCtx) + nbJobs*sizeof(ZSTDMT_jobDescription)); - if (!cctx) return NULL; - cctx->nbThreads = nbThreads; - cctx->jobIDMask = nbJobs - 1; - cctx->allJobsCompleted = 1; - cctx->sectionSize = 0; - cctx->overlapRLog = 3; - cctx->factory = POOL_create(nbThreads, 1); - cctx->buffPool = ZSTDMT_createBufferPool(nbThreads); - cctx->cctxPool = ZSTDMT_createCCtxPool(nbThreads); - if (!cctx->factory | !cctx->buffPool | !cctx->cctxPool) { /* one object was not created */ - ZSTDMT_freeCCtx(cctx); + U32 jobNb; + ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*) + ZSTD_calloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem); + int initError = 0; + if (jobTable==NULL) return NULL; + *nbJobsPtr = nbJobs; + for (jobNb=0; jobNbcstream = ZSTD_createCStream(); - if (!cctx->cstream) { - ZSTDMT_freeCCtx(cctx); return NULL; - } } - pthread_mutex_init(&cctx->jobCompleted_mutex, NULL); /* Todo : check init function return */ - pthread_cond_init(&cctx->jobCompleted_cond, NULL); - DEBUGLOG(4, "mt_cctx created, for %u threads \n", nbThreads); - return cctx; + return 
jobTable; +} + +/* ZSTDMT_CCtxParam_setNbWorkers(): + * Internal use only */ +size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers) +{ + if (nbWorkers > ZSTDMT_NBWORKERS_MAX) nbWorkers = ZSTDMT_NBWORKERS_MAX; + params->nbWorkers = nbWorkers; + params->overlapSizeLog = ZSTDMT_OVERLAPLOG_DEFAULT; + params->jobSize = 0; + return nbWorkers; } +ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem) +{ + ZSTDMT_CCtx* mtctx; + U32 nbJobs = nbWorkers + 2; + int initError; + DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers); + + if (nbWorkers < 1) return NULL; + nbWorkers = MIN(nbWorkers , ZSTDMT_NBWORKERS_MAX); + if ((cMem.customAlloc!=NULL) ^ (cMem.customFree!=NULL)) + /* invalid custom allocator */ + return NULL; + + mtctx = (ZSTDMT_CCtx*) ZSTD_calloc(sizeof(ZSTDMT_CCtx), cMem); + if (!mtctx) return NULL; + ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers); + mtctx->cMem = cMem; + mtctx->allJobsCompleted = 1; + mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem); + mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem); + assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0); /* ensure nbJobs is a power of 2 */ + mtctx->jobIDMask = nbJobs - 1; + mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem); + mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem); + initError = ZSTDMT_serialState_init(&mtctx->serial); + mtctx->roundBuff = kNullRoundBuff; + if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) { + ZSTDMT_freeCCtx(mtctx); + return NULL; + } + DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers); + return mtctx; +} + +ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers) +{ + return ZSTDMT_createCCtx_advanced(nbWorkers, ZSTD_defaultCMem); +} + + /* ZSTDMT_releaseAllJobResources() : - * Ensure all workers are killed first. 
*/ + * note : ensure all workers are killed first ! */ static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx) { unsigned jobID; + DEBUGLOG(3, "ZSTDMT_releaseAllJobResources"); for (jobID=0; jobID <= mtctx->jobIDMask; jobID++) { - ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[jobID].dstBuff); + DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff); mtctx->jobs[jobID].dstBuff = g_nullBuffer; - ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[jobID].src); - mtctx->jobs[jobID].src = g_nullBuffer; - ZSTDMT_releaseCCtx(mtctx->cctxPool, mtctx->jobs[jobID].cctx); - mtctx->jobs[jobID].cctx = NULL; + mtctx->jobs[jobID].cSize = 0; } memset(mtctx->jobs, 0, (mtctx->jobIDMask+1)*sizeof(ZSTDMT_jobDescription)); - ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->inBuff.buffer); mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; mtctx->allJobsCompleted = 1; } +static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx) +{ + DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted"); + while (mtctx->doneJobID < mtctx->nextJobID) { + unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask; + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex); + while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) { + DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */ + ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex); + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex); + mtctx->doneJobID++; + } +} + size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) { if (mtctx==NULL) return 0; /* compatible with free on NULL */ - POOL_free(mtctx->factory); - if (!mtctx->allJobsCompleted) ZSTDMT_releaseAllJobResources(mtctx); /* stop workers first */ - ZSTDMT_freeBufferPool(mtctx->buffPool); /* release job resources into pools first */ + 
POOL_free(mtctx->factory); /* stop and free worker threads */ + ZSTDMT_releaseAllJobResources(mtctx); /* release job resources into pools first */ + ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); + ZSTDMT_freeBufferPool(mtctx->bufPool); ZSTDMT_freeCCtxPool(mtctx->cctxPool); - ZSTD_freeCDict(mtctx->cdict); - ZSTD_freeCStream(mtctx->cstream); - pthread_mutex_destroy(&mtctx->jobCompleted_mutex); - pthread_cond_destroy(&mtctx->jobCompleted_cond); - free(mtctx); + ZSTDMT_freeSeqPool(mtctx->seqPool); + ZSTDMT_serialState_free(&mtctx->serial); + ZSTD_freeCDict(mtctx->cdictLocal); + if (mtctx->roundBuff.buffer) + ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem); + ZSTD_free(mtctx, mtctx->cMem); return 0; } -size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSDTMT_parameter parameter, unsigned value) +size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx) { + if (mtctx == NULL) return 0; /* supports sizeof NULL */ + return sizeof(*mtctx) + + POOL_sizeof(mtctx->factory) + + ZSTDMT_sizeof_bufferPool(mtctx->bufPool) + + (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription) + + ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool) + + ZSTDMT_sizeof_seqPool(mtctx->seqPool) + + ZSTD_sizeof_CDict(mtctx->cdictLocal) + + mtctx->roundBuff.capacity; +} + +/* Internal only */ +size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, + ZSTDMT_parameter parameter, unsigned value) { + DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter"); + switch(parameter) + { + case ZSTDMT_p_jobSize : + DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter : set jobSize to %u", value); + if ( (value > 0) /* value==0 => automatic job size */ + & (value < ZSTDMT_JOBSIZE_MIN) ) + value = ZSTDMT_JOBSIZE_MIN; + params->jobSize = value; + return value; + case ZSTDMT_p_overlapSectionLog : + if (value > 9) value = 9; + DEBUGLOG(4, "ZSTDMT_p_overlapSectionLog : %u", value); + params->overlapSizeLog = (value >= 9) ? 
9 : value; + return value; + default : + return ERROR(parameter_unsupported); + } +} + +size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value) +{ + DEBUGLOG(4, "ZSTDMT_setMTCtxParameter"); switch(parameter) { - case ZSTDMT_p_sectionSize : - mtctx->sectionSize = value; - return 0; + case ZSTDMT_p_jobSize : + return ZSTDMT_CCtxParam_setMTCtxParameter(&mtctx->params, parameter, value); case ZSTDMT_p_overlapSectionLog : - DEBUGLOG(4, "ZSTDMT_p_overlapSectionLog : %u", value); - mtctx->overlapRLog = (value >= 9) ? 0 : 9 - value; - return 0; + return ZSTDMT_CCtxParam_setMTCtxParameter(&mtctx->params, parameter, value); default : - return ERROR(compressionParameter_unsupported); + return ERROR(parameter_unsupported); + } +} + +/* Sets parameters relevant to the compression job, + * initializing others to default values. */ +static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(ZSTD_CCtx_params const params) +{ + ZSTD_CCtx_params jobParams; + memset(&jobParams, 0, sizeof(jobParams)); + + jobParams.cParams = params.cParams; + jobParams.fParams = params.fParams; + jobParams.compressionLevel = params.compressionLevel; + jobParams.disableLiteralCompression = params.disableLiteralCompression; + + return jobParams; +} + +/*! ZSTDMT_updateCParams_whileCompressing() : + * Updates only a selected set of compression parameters, to remain compatible with current frame. + * New parameters will be applied to next compression job. 
*/ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams) +{ + U32 const saved_wlog = mtctx->params.cParams.windowLog; /* Do not modify windowLog while compressing */ + int const compressionLevel = cctxParams->compressionLevel; + DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)", + compressionLevel); + mtctx->params.compressionLevel = compressionLevel; + { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, 0, 0); + cParams.windowLog = saved_wlog; + mtctx->params.cParams = cParams; } } +/* ZSTDMT_getNbWorkers(): + * @return nb threads currently active in mtctx. + * mtctx must be valid */ +unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx) +{ + assert(mtctx != NULL); + return mtctx->params.nbWorkers; +} + +/* ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. + * Note : mutex will be acquired during statistics collection. */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) +{ + ZSTD_frameProgression fps; + DEBUGLOG(6, "ZSTDMT_getFrameProgression"); + fps.consumed = mtctx->consumed; + fps.produced = mtctx->produced; + fps.ingested = mtctx->consumed + mtctx->inBuff.filled; + { unsigned jobNb; + unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1); + DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)", + mtctx->doneJobID, lastJobNb, mtctx->jobReady) + for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) { + unsigned const wJobID = jobNb & mtctx->jobIDMask; + ZSTD_pthread_mutex_lock(&mtctx->jobs[wJobID].job_mutex); + { size_t const cResult = mtctx->jobs[wJobID].cSize; + size_t const produced = ZSTD_isError(cResult) ? 
0 : cResult; + fps.consumed += mtctx->jobs[wJobID].consumed; + fps.ingested += mtctx->jobs[wJobID].src.size; + fps.produced += produced; + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + } + } + return fps; +} + /* ------------------------------------------ */ /* ===== Multi-threaded compression ===== */ /* ------------------------------------------ */ +static size_t ZSTDMT_computeTargetJobLog(ZSTD_CCtx_params const params) +{ + if (params.ldmParams.enableLdm) + return MAX(21, params.cParams.chainLog + 4); + return MAX(20, params.cParams.windowLog + 2); +} + +static size_t ZSTDMT_computeOverlapLog(ZSTD_CCtx_params const params) +{ + unsigned const overlapRLog = (params.overlapSizeLog>9) ? 0 : 9-params.overlapSizeLog; + if (params.ldmParams.enableLdm) + return (MIN(params.cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) - overlapRLog); + return overlapRLog >= 9 ? 0 : (params.cParams.windowLog - overlapRLog); +} + +static unsigned ZSTDMT_computeNbJobs(ZSTD_CCtx_params params, size_t srcSize, unsigned nbWorkers) { + assert(nbWorkers>0); + { size_t const jobSizeTarget = (size_t)1 << ZSTDMT_computeTargetJobLog(params); + size_t const jobMaxSize = jobSizeTarget << 2; + size_t const passSizeMax = jobMaxSize * nbWorkers; + unsigned const multiplier = (unsigned)(srcSize / passSizeMax) + 1; + unsigned const nbJobsLarge = multiplier * nbWorkers; + unsigned const nbJobsMax = (unsigned)(srcSize / jobSizeTarget) + 1; + unsigned const nbJobsSmall = MIN(nbJobsMax, nbWorkers); + return (multiplier>1) ? nbJobsLarge : nbJobsSmall; +} } + +/* ZSTDMT_compress_advanced_internal() : + * This is a blocking function : it will only give back control to caller after finishing its compression job. 
+ */ +static size_t ZSTDMT_compress_advanced_internal( + ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params) +{ + ZSTD_CCtx_params const jobParams = ZSTDMT_initJobCCtxParams(params); + size_t const overlapSize = (size_t)1 << ZSTDMT_computeOverlapLog(params); + unsigned const nbJobs = ZSTDMT_computeNbJobs(params, srcSize, params.nbWorkers); + size_t const proposedJobSize = (srcSize + (nbJobs-1)) / nbJobs; + size_t const avgJobSize = (((proposedJobSize-1) & 0x1FFFF) < 0x7FFF) ? proposedJobSize + 0xFFFF : proposedJobSize; /* avoid too small last block */ + const char* const srcStart = (const char*)src; + size_t remainingSrcSize = srcSize; + unsigned const compressWithinDst = (dstCapacity >= ZSTD_compressBound(srcSize)) ? nbJobs : (unsigned)(dstCapacity / ZSTD_compressBound(avgJobSize)); /* presumes avgJobSize >= 256 KB, which should be the case */ + size_t frameStartPos = 0, dstBufferPos = 0; + assert(jobParams.nbWorkers == 0); + assert(mtctx->cctxPool->totalCCtx == params.nbWorkers); + + params.jobSize = (U32)avgJobSize; + DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: nbJobs=%2u (rawSize=%u bytes; fixedSize=%u) ", + nbJobs, (U32)proposedJobSize, (U32)avgJobSize); + + if ((nbJobs==1) | (params.nbWorkers<=1)) { /* fallback to single-thread mode : this is a blocking invocation anyway */ + ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0]; + DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: fallback to single-thread mode"); + if (cdict) return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, jobParams.fParams); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, NULL, 0, jobParams); + } + + assert(avgJobSize >= 256 KB); /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */ + ZSTDMT_setBufferSize(mtctx->bufPool, 
ZSTD_compressBound(avgJobSize) ); + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params)) + return ERROR(memory_allocation); + + if (nbJobs > mtctx->jobIDMask+1) { /* enlarge job table */ + U32 jobsTableSize = nbJobs; + ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); + mtctx->jobIDMask = 0; + mtctx->jobs = ZSTDMT_createJobsTable(&jobsTableSize, mtctx->cMem); + if (mtctx->jobs==NULL) return ERROR(memory_allocation); + assert((jobsTableSize != 0) && ((jobsTableSize & (jobsTableSize - 1)) == 0)); /* ensure jobsTableSize is a power of 2 */ + mtctx->jobIDMask = jobsTableSize - 1; + } + + { unsigned u; + for (u=0; ujobs[u].prefix.start = srcStart + frameStartPos - dictSize; + mtctx->jobs[u].prefix.size = dictSize; + mtctx->jobs[u].src.start = srcStart + frameStartPos; + mtctx->jobs[u].src.size = jobSize; assert(jobSize > 0); /* avoid job.src.size == 0 */ + mtctx->jobs[u].consumed = 0; + mtctx->jobs[u].cSize = 0; + mtctx->jobs[u].cdict = (u==0) ? cdict : NULL; + mtctx->jobs[u].fullFrameSize = srcSize; + mtctx->jobs[u].params = jobParams; + /* do not calculate checksum within sections, but write it in header for first section */ + mtctx->jobs[u].dstBuff = dstBuffer; + mtctx->jobs[u].cctxPool = mtctx->cctxPool; + mtctx->jobs[u].bufPool = mtctx->bufPool; + mtctx->jobs[u].seqPool = mtctx->seqPool; + mtctx->jobs[u].serial = &mtctx->serial; + mtctx->jobs[u].jobID = u; + mtctx->jobs[u].firstJob = (u==0); + mtctx->jobs[u].lastJob = (u==nbJobs-1); + + DEBUGLOG(5, "ZSTDMT_compress_advanced_internal: posting job %u (%u bytes)", u, (U32)jobSize); + DEBUG_PRINTHEX(6, mtctx->jobs[u].prefix.start, 12); + POOL_add(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[u]); + + frameStartPos += jobSize; + dstBufferPos += dstBufferCapacity; + remainingSrcSize -= jobSize; + } } + + /* collect result */ + { size_t error = 0, dstPos = 0; + unsigned jobID; + for (jobID=0; jobIDjobs[jobID].job_mutex); + while (mtctx->jobs[jobID].consumed < 
mtctx->jobs[jobID].src.size) { + DEBUGLOG(5, "waiting for jobCompleted signal from job %u", jobID); + ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex); + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex); + DEBUGLOG(5, "ready to write job %u ", jobID); + + { size_t const cSize = mtctx->jobs[jobID].cSize; + if (ZSTD_isError(cSize)) error = cSize; + if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall); + if (jobID) { /* note : job 0 is written directly at dst, which is correct position */ + if (!error) + memmove((char*)dst + dstPos, mtctx->jobs[jobID].dstBuff.start, cSize); /* may overlap when job compressed within dst */ + if (jobID >= compressWithinDst) { /* job compressed into its own buffer, which must be released */ + DEBUGLOG(5, "releasing buffer %u>=%u", jobID, compressWithinDst); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff); + } } + mtctx->jobs[jobID].dstBuff = g_nullBuffer; + mtctx->jobs[jobID].cSize = 0; + dstPos += cSize ; + } + } /* for (jobID=0; jobIDserial.xxhState); + if (dstPos + 4 > dstCapacity) { + error = ERROR(dstSize_tooSmall); + } else { + DEBUGLOG(4, "writing checksum : %08X \n", checksum); + MEM_writeLE32((char*)dst + dstPos, checksum); + dstPos += 4; + } } + + if (!error) DEBUGLOG(4, "compressed size : %u ", (U32)dstPos); + return error ? 
error : dstPos; + } +} + +size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_parameters params, + unsigned overlapLog) +{ + ZSTD_CCtx_params cctxParams = mtctx->params; + cctxParams.cParams = params.cParams; + cctxParams.fParams = params.fParams; + cctxParams.overlapSizeLog = overlapLog; + return ZSTDMT_compress_advanced_internal(mtctx, + dst, dstCapacity, + src, srcSize, + cdict, cctxParams); +} + + size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) { + U32 const overlapLog = (compressionLevel >= ZSTD_maxCLevel()) ? 9 : ZSTDMT_OVERLAPLOG_DEFAULT; ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, 0); - size_t const chunkTargetSize = (size_t)1 << (params.cParams.windowLog + 2); - unsigned const nbChunksMax = (unsigned)(srcSize / chunkTargetSize) + (srcSize < chunkTargetSize) /* min 1 */; - unsigned nbChunks = MIN(nbChunksMax, mtctx->nbThreads); - size_t const proposedChunkSize = (srcSize + (nbChunks-1)) / nbChunks; - size_t const avgChunkSize = ((proposedChunkSize & 0x1FFFF) < 0xFFFF) ? 
proposedChunkSize + 0xFFFF : proposedChunkSize; /* avoid too small last block */ - size_t remainingSrcSize = srcSize; - const char* const srcStart = (const char*)src; - size_t frameStartPos = 0; - - DEBUGLOG(3, "windowLog : %2u => chunkTargetSize : %u bytes ", params.cParams.windowLog, (U32)chunkTargetSize); - DEBUGLOG(2, "nbChunks : %2u (chunkSize : %u bytes) ", nbChunks, (U32)avgChunkSize); params.fParams.contentSizeFlag = 1; - - if (nbChunks==1) { /* fallback to single-thread mode */ - ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0]; - return ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); - } - - { unsigned u; - for (u=0; ubuffPool, dstBufferCapacity) : dstAsBuffer; - ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(mtctx->cctxPool); - - if ((cctx==NULL) || (dstBuffer.start==NULL)) { - mtctx->jobs[u].cSize = ERROR(memory_allocation); /* job result */ - mtctx->jobs[u].jobCompleted = 1; - nbChunks = u+1; - break; /* let's wait for previous jobs to complete, but don't start new ones */ - } - - mtctx->jobs[u].srcStart = srcStart + frameStartPos; - mtctx->jobs[u].srcSize = chunkSize; - mtctx->jobs[u].fullFrameSize = srcSize; - mtctx->jobs[u].params = params; - mtctx->jobs[u].dstBuff = dstBuffer; - mtctx->jobs[u].cctx = cctx; - mtctx->jobs[u].firstChunk = (u==0); - mtctx->jobs[u].lastChunk = (u==nbChunks-1); - mtctx->jobs[u].jobCompleted = 0; - mtctx->jobs[u].jobCompleted_mutex = &mtctx->jobCompleted_mutex; - mtctx->jobs[u].jobCompleted_cond = &mtctx->jobCompleted_cond; - - DEBUGLOG(3, "posting job %u (%u bytes)", u, (U32)chunkSize); - DEBUG_PRINTHEX(3, mtctx->jobs[u].srcStart, 12); - POOL_add(mtctx->factory, ZSTDMT_compressChunk, &mtctx->jobs[u]); - - frameStartPos += chunkSize; - remainingSrcSize -= chunkSize; - } } - /* note : since nbChunks <= nbThreads, all jobs should be running immediately in parallel */ - - { unsigned chunkID; - size_t error = 0, dstPos = 0; - for (chunkID=0; chunkIDjobCompleted_mutex); - while 
(mtctx->jobs[chunkID].jobCompleted==0) { - DEBUGLOG(4, "waiting for jobCompleted signal from chunk %u", chunkID); - pthread_cond_wait(&mtctx->jobCompleted_cond, &mtctx->jobCompleted_mutex); - } - pthread_mutex_unlock(&mtctx->jobCompleted_mutex); - DEBUGLOG(3, "ready to write chunk %u ", chunkID); - - ZSTDMT_releaseCCtx(mtctx->cctxPool, mtctx->jobs[chunkID].cctx); - mtctx->jobs[chunkID].cctx = NULL; - mtctx->jobs[chunkID].srcStart = NULL; - { size_t const cSize = mtctx->jobs[chunkID].cSize; - if (ZSTD_isError(cSize)) error = cSize; - if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall); - if (chunkID) { /* note : chunk 0 is already written directly into dst */ - if (!error) memcpy((char*)dst + dstPos, mtctx->jobs[chunkID].dstBuff.start, cSize); - ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[chunkID].dstBuff); - mtctx->jobs[chunkID].dstBuff = g_nullBuffer; - } - dstPos += cSize ; - } - } - if (!error) DEBUGLOG(3, "compressed size : %u ", (U32)dstPos); - return error ? 
error : dstPos; - } - + return ZSTDMT_compress_advanced(mtctx, dst, dstCapacity, src, srcSize, NULL, params, overlapLog); } @@ -480,261 +1215,617 @@ /* ======= Streaming API ======= */ /* ====================================== */ -static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* zcs) { - while (zcs->doneJobID < zcs->nextJobID) { - unsigned const jobID = zcs->doneJobID & zcs->jobIDMask; - PTHREAD_MUTEX_LOCK(&zcs->jobCompleted_mutex); - while (zcs->jobs[jobID].jobCompleted==0) { - DEBUGLOG(4, "waiting for jobCompleted signal from chunk %u", zcs->doneJobID); /* we want to block when waiting for data to flush */ - pthread_cond_wait(&zcs->jobCompleted_cond, &zcs->jobCompleted_mutex); +size_t ZSTDMT_initCStream_internal( + ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, ZSTD_CCtx_params params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u, disableLiteralCompression=%i)", + (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx, params.disableLiteralCompression); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + assert(mtctx->cctxPool->totalCCtx == params.nbWorkers); + + /* init */ + if (params.jobSize == 0) { + params.jobSize = 1U << ZSTDMT_computeTargetJobLog(params); + } + if (params.jobSize > ZSTDMT_JOBSIZE_MAX) params.jobSize = ZSTDMT_JOBSIZE_MAX; + + mtctx->singleBlockingThread = (pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN); /* do not trigger multi-threading when srcSize is too small */ + if (mtctx->singleBlockingThread) { + ZSTD_CCtx_params const singleThreadParams = ZSTDMT_initJobCCtxParams(params); + DEBUGLOG(5, "ZSTDMT_initCStream_internal: switch to single blocking thread mode"); + assert(singleThreadParams.nbWorkers == 0); + return 
ZSTD_initCStream_internal(mtctx->cctxPool->cctx[0], + dict, dictSize, cdict, + singleThreadParams, pledgedSrcSize); + } + + DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers); + + if (mtctx->allJobsCompleted == 0) { /* previous compression not correctly finished */ + ZSTDMT_waitForAllJobsCompleted(mtctx); + ZSTDMT_releaseAllJobResources(mtctx); + mtctx->allJobsCompleted = 1; + } + + mtctx->params = params; + mtctx->frameContentSize = pledgedSrcSize; + if (dict) { + ZSTD_freeCDict(mtctx->cdictLocal); + mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */ + params.cParams, mtctx->cMem); + mtctx->cdict = mtctx->cdictLocal; + if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation); + } else { + ZSTD_freeCDict(mtctx->cdictLocal); + mtctx->cdictLocal = NULL; + mtctx->cdict = cdict; + } + + mtctx->targetPrefixSize = (size_t)1 << ZSTDMT_computeOverlapLog(params); + DEBUGLOG(4, "overlapLog=%u => %u KB", params.overlapSizeLog, (U32)(mtctx->targetPrefixSize>>10)); + mtctx->targetSectionSize = params.jobSize; + if (mtctx->targetSectionSize < ZSTDMT_JOBSIZE_MIN) mtctx->targetSectionSize = ZSTDMT_JOBSIZE_MIN; + if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */ + DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), params.jobSize); + DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10)); + ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize)); + { + /* If ldm is enabled we need windowSize space. */ + size_t const windowSize = mtctx->params.ldmParams.enableLdm ? (1U << mtctx->params.cParams.windowLog) : 0; + /* Two buffers of slack, plus extra space for the overlap + * This is the minimum slack that LDM works with. 
One extra because + * flush might waste up to targetSectionSize-1 bytes. Another extra + * for the overlap (if > 0), then one to fill which doesn't overlap + * with the LDM window. + */ + size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0); + size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers; + /* Compute the total size, and always have enough slack */ + size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1); + size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers; + size_t const capacity = MAX(windowSize, sectionsSize) + slackSize; + if (mtctx->roundBuff.capacity < capacity) { + if (mtctx->roundBuff.buffer) + ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem); + mtctx->roundBuff.buffer = (BYTE*)ZSTD_malloc(capacity, mtctx->cMem); + if (mtctx->roundBuff.buffer == NULL) { + mtctx->roundBuff.capacity = 0; + return ERROR(memory_allocation); + } + mtctx->roundBuff.capacity = capacity; + } + } + DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10)); + mtctx->roundBuff.pos = 0; + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + mtctx->inBuff.prefix = kNullRange; + mtctx->doneJobID = 0; + mtctx->nextJobID = 0; + mtctx->frameEnded = 0; + mtctx->allJobsCompleted = 0; + mtctx->consumed = 0; + mtctx->produced = 0; + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params)) + return ERROR(memory_allocation); + return 0; +} + +size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, + ZSTD_parameters params, + unsigned long long pledgedSrcSize) +{ + ZSTD_CCtx_params cctxParams = mtctx->params; /* retrieve sticky params */ + DEBUGLOG(4, "ZSTDMT_initCStream_advanced (pledgedSrcSize=%u)", (U32)pledgedSrcSize); + cctxParams.cParams = params.cParams; + cctxParams.fParams = params.fParams; + return ZSTDMT_initCStream_internal(mtctx, dict, dictSize, ZSTD_dct_auto, NULL, + cctxParams, pledgedSrcSize); +} + +size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* 
mtctx, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize) +{ + ZSTD_CCtx_params cctxParams = mtctx->params; + if (cdict==NULL) return ERROR(dictionary_wrong); /* method incompatible with NULL cdict */ + cctxParams.cParams = ZSTD_getCParamsFromCDict(cdict); + cctxParams.fParams = fParams; + return ZSTDMT_initCStream_internal(mtctx, NULL, 0 /*dictSize*/, ZSTD_dct_auto, cdict, + cctxParams, pledgedSrcSize); +} + + +/* ZSTDMT_resetCStream() : + * pledgedSrcSize can be zero == unknown (for the time being) + * prefer using ZSTD_CONTENTSIZE_UNKNOWN, + * as `0` might mean "empty" in the future */ +size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize) +{ + if (!pledgedSrcSize) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, 0, mtctx->params, + pledgedSrcSize); +} + +size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel) { + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + ZSTD_CCtx_params cctxParams = mtctx->params; /* retrieve sticky params */ + DEBUGLOG(4, "ZSTDMT_initCStream (cLevel=%i)", compressionLevel); + cctxParams.cParams = params.cParams; + cctxParams.fParams = params.fParams; + return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN); +} + + +/* ZSTDMT_writeLastEmptyBlock() + * Write a single empty block with an end-of-frame to finish a frame. + * Job must be created from streaming variant. + * This function is always successfull if expected conditions are fulfilled. 
+ */ +static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job) +{ + assert(job->lastJob == 1); + assert(job->src.size == 0); /* last job is empty -> will be simplified into a last empty block */ + assert(job->firstJob == 0); /* cannot be first job, as it also needs to create frame header */ + assert(job->dstBuff.start == NULL); /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */ + job->dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (job->dstBuff.start == NULL) { + job->cSize = ERROR(memory_allocation); + return; + } + assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize); /* no buffer should ever be that small */ + job->src = kNullRange; + job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity); + assert(!ZSTD_isError(job->cSize)); + assert(job->consumed == 0); +} + +static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp) +{ + unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask; + int const endFrame = (endOp == ZSTD_e_end); + + if (mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full"); + assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask)); + return 0; + } + + if (!mtctx->jobReady) { + BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start; + DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ", + mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size); + mtctx->jobs[jobID].src.start = src; + mtctx->jobs[jobID].src.size = srcSize; + assert(mtctx->inBuff.filled >= srcSize); + mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix; + mtctx->jobs[jobID].consumed = 0; + mtctx->jobs[jobID].cSize = 0; + mtctx->jobs[jobID].params = mtctx->params; + mtctx->jobs[jobID].cdict = mtctx->nextJobID==0 ? 
mtctx->cdict : NULL; + mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize; + mtctx->jobs[jobID].dstBuff = g_nullBuffer; + mtctx->jobs[jobID].cctxPool = mtctx->cctxPool; + mtctx->jobs[jobID].bufPool = mtctx->bufPool; + mtctx->jobs[jobID].seqPool = mtctx->seqPool; + mtctx->jobs[jobID].serial = &mtctx->serial; + mtctx->jobs[jobID].jobID = mtctx->nextJobID; + mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0); + mtctx->jobs[jobID].lastJob = endFrame; + mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag; + mtctx->jobs[jobID].dstFlushed = 0; + + /* Update the round buffer pos and clear the input buffer to be reset */ + mtctx->roundBuff.pos += srcSize; + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + /* Set the prefix */ + if (!endFrame) { + size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize); + mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize; + mtctx->inBuff.prefix.size = newPrefixSize; + } else { /* endFrame==1 => no need for another input buffer */ + mtctx->inBuff.prefix = kNullRange; + mtctx->frameEnded = endFrame; + if (mtctx->nextJobID == 0) { + /* single job exception : checksum is already calculated directly within worker thread */ + mtctx->params.fParams.checksumFlag = 0; + } } + + if ( (srcSize == 0) + && (mtctx->nextJobID>0)/*single job must also write frame header*/ ) { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame"); + assert(endOp == ZSTD_e_end); /* only possible case : need to end the frame with an empty last block */ + ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID); + mtctx->nextJobID++; + return 0; } - pthread_mutex_unlock(&zcs->jobCompleted_mutex); - zcs->doneJobID++; + } + + DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes (end:%u, jobNb == %u (mod:%u))", + mtctx->nextJobID, + (U32)mtctx->jobs[jobID].src.size, + mtctx->jobs[jobID].lastJob, + mtctx->nextJobID, + jobID); + if 
(POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) { + mtctx->nextJobID++; + mtctx->jobReady = 0; + } else { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID); + mtctx->jobReady = 1; + } + return 0; +} + + +/*! ZSTDMT_flushProduced() : + * `output` : `pos` will be updated with amount of data flushed . + * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush . + * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */ +static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end) +{ + unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask; + DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)", + blockToFlush, mtctx->doneJobID, mtctx->nextJobID); + assert(output->size >= output->pos); + + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex); + if ( blockToFlush + && (mtctx->doneJobID < mtctx->nextJobID) ) { + assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize); + while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) { /* nothing to flush */ + if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) { + DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size); + break; + } + DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); + ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex); /* block when nothing to flush but some to come */ + } } + + /* try to flush something */ + { size_t cSize = mtctx->jobs[wJobID].cSize; /* shared */ + size_t const srcConsumed = mtctx->jobs[wJobID].consumed; /* shared */ + size_t const srcSize = 
mtctx->jobs[wJobID].src.size; /* read-only, could be done after mutex lock, but no-declaration-after-statement */ + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + if (ZSTD_isError(cSize)) { + DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s", + mtctx->doneJobID, ZSTD_getErrorName(cSize)); + ZSTDMT_waitForAllJobsCompleted(mtctx); + ZSTDMT_releaseAllJobResources(mtctx); + return cSize; + } + /* add frame checksum if necessary (can only happen once) */ + assert(srcConsumed <= srcSize); + if ( (srcConsumed == srcSize) /* job completed -> worker no longer active */ + && mtctx->jobs[wJobID].frameChecksumNeeded ) { + U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState); + DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum); + MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum); + cSize += 4; + mtctx->jobs[wJobID].cSize += 4; /* can write this shared value, as worker is no longer active */ + mtctx->jobs[wJobID].frameChecksumNeeded = 0; + } + if (cSize > 0) { /* compression is ongoing or completed */ + size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos); + DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)", + (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize); + assert(mtctx->doneJobID < mtctx->nextJobID); + assert(cSize >= mtctx->jobs[wJobID].dstFlushed); + assert(mtctx->jobs[wJobID].dstBuff.start != NULL); + memcpy((char*)output->dst + output->pos, + (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed, + toFlush); + output->pos += toFlush; + mtctx->jobs[wJobID].dstFlushed += toFlush; /* can write : this value is only used by mtctx */ + + if ( (srcConsumed == srcSize) /* job completed */ + && (mtctx->jobs[wJobID].dstFlushed == cSize) ) { /* output buffer fully flushed => free this job position */ + DEBUGLOG(5, "Job %u 
completed (%u bytes), moving to next one", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff); + mtctx->jobs[wJobID].dstBuff = g_nullBuffer; + mtctx->jobs[wJobID].cSize = 0; /* ensure this job slot is considered "not started" in future check */ + mtctx->consumed += srcSize; + mtctx->produced += cSize; + mtctx->doneJobID++; + } } + + /* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */ + if (cSize > mtctx->jobs[wJobID].dstFlushed) return (cSize - mtctx->jobs[wJobID].dstFlushed); + if (srcSize > srcConsumed) return 1; /* current job not completely compressed */ + } + if (mtctx->doneJobID < mtctx->nextJobID) return 1; /* some more jobs ongoing */ + if (mtctx->jobReady) return 1; /* one job is ready to push, just not yet in the list */ + if (mtctx->inBuff.filled > 0) return 1; /* input is not empty, and still needs to be converted into a job */ + mtctx->allJobsCompleted = mtctx->frameEnded; /* all jobs are entirely flushed => if this one is last one, frame is completed */ + if (end == ZSTD_e_end) return !mtctx->frameEnded; /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */ + return 0; /* internal buffers fully flushed */ +} + +/** + * Returns the range of data used by the earliest job that is not yet complete. + * If the data of the first job is broken up into two segments, we cover both + * sections. 
+ */ +static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) +{ + unsigned const firstJobID = mtctx->doneJobID; + unsigned const lastJobID = mtctx->nextJobID; + unsigned jobID; + + for (jobID = firstJobID; jobID < lastJobID; ++jobID) { + unsigned const wJobID = jobID & mtctx->jobIDMask; + size_t consumed; + + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex); + consumed = mtctx->jobs[wJobID].consumed; + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + + if (consumed < mtctx->jobs[wJobID].src.size) { + range_t range = mtctx->jobs[wJobID].prefix; + if (range.size == 0) { + /* Empty prefix */ + range = mtctx->jobs[wJobID].src; + } + /* Job source in multiple segments not supported yet */ + assert(range.start <= mtctx->jobs[wJobID].src.start); + return range; + } + } + return kNullRange; +} + +/** + * Returns non-zero iff buffer and range overlap. + */ +static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range) +{ + BYTE const* const bufferStart = (BYTE const*)buffer.start; + BYTE const* const bufferEnd = bufferStart + buffer.capacity; + BYTE const* const rangeStart = (BYTE const*)range.start; + BYTE const* const rangeEnd = rangeStart + range.size; + + if (rangeStart == NULL || bufferStart == NULL) + return 0; + /* Empty ranges cannot overlap */ + if (bufferStart == bufferEnd || rangeStart == rangeEnd) + return 0; + + return bufferStart < rangeEnd && rangeStart < bufferEnd; +} + +static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window) +{ + range_t extDict; + range_t prefix; + + extDict.start = window.dictBase + window.lowLimit; + extDict.size = window.dictLimit - window.lowLimit; + + prefix.start = window.base + window.dictLimit; + prefix.size = window.nextSrc - (window.base + window.dictLimit); + DEBUGLOG(5, "extDict [0x%zx, 0x%zx)", + (size_t)extDict.start, + (size_t)extDict.start + extDict.size); + DEBUGLOG(5, "prefix [0x%zx, 0x%zx)", + (size_t)prefix.start, + (size_t)prefix.start + prefix.size); + + return 
ZSTDMT_isOverlapped(buffer, extDict) + || ZSTDMT_isOverlapped(buffer, prefix); +} + +static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer) +{ + if (mtctx->params.ldmParams.enableLdm) { + ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex; + DEBUGLOG(5, "source [0x%zx, 0x%zx)", + (size_t)buffer.start, + (size_t)buffer.start + buffer.capacity); + ZSTD_PTHREAD_MUTEX_LOCK(mutex); + while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) { + DEBUGLOG(6, "Waiting for LDM to finish..."); + ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex); + } + DEBUGLOG(6, "Done waiting for LDM to finish"); + ZSTD_pthread_mutex_unlock(mutex); + } +} + +/** + * Attempts to set the inBuff to the next section to fill. + * If any part of the new section is still in use we give up. + * Returns non-zero if the buffer is filled. + */ +static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) +{ + range_t const inUse = ZSTDMT_getInputDataInUse(mtctx); + size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos; + size_t const target = mtctx->targetSectionSize; + buffer_t buffer; + + assert(mtctx->inBuff.buffer.start == NULL); + assert(mtctx->roundBuff.capacity >= target); + + if (spaceLeft < target) { + /* ZSTD_invalidateRepCodes() doesn't work for extDict variants. + * Simply copy the prefix to the beginning in that case. 
+ */ + BYTE* const start = (BYTE*)mtctx->roundBuff.buffer; + size_t const prefixSize = mtctx->inBuff.prefix.size; + + buffer.start = start; + buffer.capacity = prefixSize; + if (ZSTDMT_isOverlapped(buffer, inUse)) { + DEBUGLOG(6, "Waiting for buffer..."); + return 0; + } + ZSTDMT_waitForLdmComplete(mtctx, buffer); + memmove(start, mtctx->inBuff.prefix.start, prefixSize); + mtctx->inBuff.prefix.start = start; + mtctx->roundBuff.pos = prefixSize; + } + buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos; + buffer.capacity = target; + + if (ZSTDMT_isOverlapped(buffer, inUse)) { + DEBUGLOG(6, "Waiting for buffer..."); + return 0; + } + assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix)); + + ZSTDMT_waitForLdmComplete(mtctx, buffer); + + DEBUGLOG(5, "Using prefix range [%zx, %zx)", + (size_t)mtctx->inBuff.prefix.start, + (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size); + DEBUGLOG(5, "Using source range [%zx, %zx)", + (size_t)buffer.start, + (size_t)buffer.start + buffer.capacity); + + + mtctx->inBuff.buffer = buffer; + mtctx->inBuff.filled = 0; + assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity); + return 1; +} + + +/** ZSTDMT_compressStream_generic() : + * internal use only - exposed to be invoked from zstd_compress.c + * assumption : output and input are valid (pos <= size) + * @return : minimum amount of data remaining to flush, 0 if none */ +size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + unsigned forwardInputProgress = 0; + DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)", + (U32)endOp, (U32)(input->size - input->pos)); + assert(output->pos <= output->size); + assert(input->pos <= input->size); + + if (mtctx->singleBlockingThread) { /* delegate to single-thread (synchronous) */ + return ZSTD_compressStream_generic(mtctx->cctxPool->cctx[0], output, input, endOp); + } + + if ((mtctx->frameEnded) && 
(endOp==ZSTD_e_continue)) { + /* current frame being ended. Only flush/end are allowed */ + return ERROR(stage_wrong); + } + + /* single-pass shortcut (note : synchronous-mode) */ + if ( (mtctx->nextJobID == 0) /* just started */ + && (mtctx->inBuff.filled == 0) /* nothing buffered */ + && (!mtctx->jobReady) /* no job already created */ + && (endOp == ZSTD_e_end) /* end order */ + && (output->size - output->pos >= ZSTD_compressBound(input->size - input->pos)) ) { /* enough space in dst */ + size_t const cSize = ZSTDMT_compress_advanced_internal(mtctx, + (char*)output->dst + output->pos, output->size - output->pos, + (const char*)input->src + input->pos, input->size - input->pos, + mtctx->cdict, mtctx->params); + if (ZSTD_isError(cSize)) return cSize; + input->pos = input->size; + output->pos += cSize; + mtctx->allJobsCompleted = 1; + mtctx->frameEnded = 1; + return 0; + } + + /* fill input buffer */ + if ( (!mtctx->jobReady) + && (input->size > input->pos) ) { /* support NULL input */ + if (mtctx->inBuff.buffer.start == NULL) { + assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */ + if (!ZSTDMT_tryGetInputRange(mtctx)) { + /* It is only possible for this operation to fail if there are + * still compression jobs ongoing. 
+ */ + assert(mtctx->doneJobID != mtctx->nextJobID); + } + } + if (mtctx->inBuff.buffer.start != NULL) { + size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled); + assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize); + DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u", + (U32)toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize); + memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, toLoad); + input->pos += toLoad; + mtctx->inBuff.filled += toLoad; + forwardInputProgress = toLoad>0; + } + if ((input->pos < input->size) && (endOp == ZSTD_e_end)) + endOp = ZSTD_e_flush; /* can't end now : not all input consumed */ + } + + if ( (mtctx->jobReady) + || (mtctx->inBuff.filled >= mtctx->targetSectionSize) /* filled enough : let's compress */ + || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0)) /* something to flush : let's go */ + || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) { /* must finish the frame with a zero-size block */ + size_t const jobSize = mtctx->inBuff.filled; + assert(mtctx->inBuff.filled <= mtctx->targetSectionSize); + CHECK_F( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) ); + } + + /* check for potential compressed data ready to be flushed */ + { size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */ + if (input->pos < input->size) return MAX(remainingToFlush, 1); /* input not consumed : do not end flush yet */ + return remainingToFlush; } } -static size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs, - const void* dict, size_t dictSize, unsigned updateDict, - ZSTD_parameters params, unsigned long long pledgedSrcSize) +size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input) { - ZSTD_customMem const cmem = { NULL, NULL, NULL }; - 
DEBUGLOG(3, "Started new compression, with windowLog : %u", params.cParams.windowLog); - if (zcs->nbThreads==1) return ZSTD_initCStream_advanced(zcs->cstream, dict, dictSize, params, pledgedSrcSize); - if (zcs->allJobsCompleted == 0) { /* previous job not correctly finished */ - ZSTDMT_waitForAllJobsCompleted(zcs); - ZSTDMT_releaseAllJobResources(zcs); - zcs->allJobsCompleted = 1; - } - zcs->params = params; - if (updateDict) { - ZSTD_freeCDict(zcs->cdict); zcs->cdict = NULL; - if (dict && dictSize) { - zcs->cdict = ZSTD_createCDict_advanced(dict, dictSize, 0, params, cmem); - if (zcs->cdict == NULL) return ERROR(memory_allocation); - } } - zcs->frameContentSize = pledgedSrcSize; - zcs->targetDictSize = (zcs->overlapRLog>=9) ? 0 : (size_t)1 << (zcs->params.cParams.windowLog - zcs->overlapRLog); - DEBUGLOG(4, "overlapRLog : %u ", zcs->overlapRLog); - DEBUGLOG(3, "overlap Size : %u KB", (U32)(zcs->targetDictSize>>10)); - zcs->targetSectionSize = zcs->sectionSize ? zcs->sectionSize : (size_t)1 << (zcs->params.cParams.windowLog + 2); - zcs->targetSectionSize = MAX(ZSTDMT_SECTION_SIZE_MIN, zcs->targetSectionSize); - zcs->targetSectionSize = MAX(zcs->targetDictSize, zcs->targetSectionSize); - DEBUGLOG(3, "Section Size : %u KB", (U32)(zcs->targetSectionSize>>10)); - zcs->marginSize = zcs->targetSectionSize >> 2; - zcs->inBuffSize = zcs->targetDictSize + zcs->targetSectionSize + zcs->marginSize; - zcs->inBuff.buffer = ZSTDMT_getBuffer(zcs->buffPool, zcs->inBuffSize); - if (zcs->inBuff.buffer.start == NULL) return ERROR(memory_allocation); - zcs->inBuff.filled = 0; - zcs->dictSize = 0; - zcs->doneJobID = 0; - zcs->nextJobID = 0; - zcs->frameEnded = 0; - zcs->allJobsCompleted = 0; - if (params.fParams.checksumFlag) XXH64_reset(&zcs->xxhState, 0); - return 0; -} + CHECK_F( ZSTDMT_compressStream_generic(mtctx, output, input, ZSTD_e_continue) ); -size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* zcs, - const void* dict, size_t dictSize, - ZSTD_parameters params, unsigned long 
long pledgedSrcSize) -{ - return ZSTDMT_initCStream_internal(zcs, dict, dictSize, 1, params, pledgedSrcSize); -} - -/* ZSTDMT_resetCStream() : - * pledgedSrcSize is optional and can be zero == unknown */ -size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* zcs, unsigned long long pledgedSrcSize) -{ - if (zcs->nbThreads==1) return ZSTD_resetCStream(zcs->cstream, pledgedSrcSize); - return ZSTDMT_initCStream_internal(zcs, NULL, 0, 0, zcs->params, pledgedSrcSize); -} - -size_t ZSTDMT_initCStream(ZSTDMT_CCtx* zcs, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, 0); - return ZSTDMT_initCStream_internal(zcs, NULL, 0, 1, params, 0); + /* recommended next input size : fill current input buffer */ + return mtctx->targetSectionSize - mtctx->inBuff.filled; /* note : could be zero when input buffer is fully filled and no more availability to create new job */ } -static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* zcs, size_t srcSize, unsigned endFrame) +static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_EndDirective endFrame) { - size_t const dstBufferCapacity = ZSTD_compressBound(srcSize); - buffer_t const dstBuffer = ZSTDMT_getBuffer(zcs->buffPool, dstBufferCapacity); - ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(zcs->cctxPool); - unsigned const jobID = zcs->nextJobID & zcs->jobIDMask; + size_t const srcSize = mtctx->inBuff.filled; + DEBUGLOG(5, "ZSTDMT_flushStream_internal"); - if ((cctx==NULL) || (dstBuffer.start==NULL)) { - zcs->jobs[jobID].jobCompleted = 1; - zcs->nextJobID++; - ZSTDMT_waitForAllJobsCompleted(zcs); - ZSTDMT_releaseAllJobResources(zcs); - return ERROR(memory_allocation); + if ( mtctx->jobReady /* one job ready for a worker to pick up */ + || (srcSize > 0) /* still some data within input buffer */ + || ((endFrame==ZSTD_e_end) && !mtctx->frameEnded)) { /* need a last 0-size block to end frame */ + DEBUGLOG(5, "ZSTDMT_flushStream_internal : create a new job (%u bytes, end:%u)", + 
(U32)srcSize, (U32)endFrame); + CHECK_F( ZSTDMT_createCompressionJob(mtctx, srcSize, endFrame) ); } - DEBUGLOG(4, "preparing job %u to compress %u bytes with %u preload ", zcs->nextJobID, (U32)srcSize, (U32)zcs->dictSize); - zcs->jobs[jobID].src = zcs->inBuff.buffer; - zcs->jobs[jobID].srcStart = zcs->inBuff.buffer.start; - zcs->jobs[jobID].srcSize = srcSize; - zcs->jobs[jobID].dictSize = zcs->dictSize; /* note : zcs->inBuff.filled is presumed >= srcSize + dictSize */ - zcs->jobs[jobID].params = zcs->params; - if (zcs->nextJobID) zcs->jobs[jobID].params.fParams.checksumFlag = 0; /* do not calculate checksum within sections, just keep it in header for first section */ - zcs->jobs[jobID].cdict = zcs->nextJobID==0 ? zcs->cdict : NULL; - zcs->jobs[jobID].fullFrameSize = zcs->frameContentSize; - zcs->jobs[jobID].dstBuff = dstBuffer; - zcs->jobs[jobID].cctx = cctx; - zcs->jobs[jobID].firstChunk = (zcs->nextJobID==0); - zcs->jobs[jobID].lastChunk = endFrame; - zcs->jobs[jobID].jobCompleted = 0; - zcs->jobs[jobID].dstFlushed = 0; - zcs->jobs[jobID].jobCompleted_mutex = &zcs->jobCompleted_mutex; - zcs->jobs[jobID].jobCompleted_cond = &zcs->jobCompleted_cond; - - /* get a new buffer for next input */ - if (!endFrame) { - size_t const newDictSize = MIN(srcSize + zcs->dictSize, zcs->targetDictSize); - zcs->inBuff.buffer = ZSTDMT_getBuffer(zcs->buffPool, zcs->inBuffSize); - if (zcs->inBuff.buffer.start == NULL) { /* not enough memory to allocate next input buffer */ - zcs->jobs[jobID].jobCompleted = 1; - zcs->nextJobID++; - ZSTDMT_waitForAllJobsCompleted(zcs); - ZSTDMT_releaseAllJobResources(zcs); - return ERROR(memory_allocation); - } - DEBUGLOG(5, "inBuff filled to %u", (U32)zcs->inBuff.filled); - zcs->inBuff.filled -= srcSize + zcs->dictSize - newDictSize; - DEBUGLOG(5, "new job : filled to %u, with %u dict and %u src", (U32)zcs->inBuff.filled, (U32)newDictSize, (U32)(zcs->inBuff.filled - newDictSize)); - memmove(zcs->inBuff.buffer.start, (const 
char*)zcs->jobs[jobID].srcStart + zcs->dictSize + srcSize - newDictSize, zcs->inBuff.filled); - DEBUGLOG(5, "new inBuff pre-filled"); - zcs->dictSize = newDictSize; - } else { - zcs->inBuff.buffer = g_nullBuffer; - zcs->inBuff.filled = 0; - zcs->dictSize = 0; - zcs->frameEnded = 1; - if (zcs->nextJobID == 0) - zcs->params.fParams.checksumFlag = 0; /* single chunk : checksum is calculated directly within worker thread */ - } - - DEBUGLOG(3, "posting job %u : %u bytes (end:%u) (note : doneJob = %u=>%u)", zcs->nextJobID, (U32)zcs->jobs[jobID].srcSize, zcs->jobs[jobID].lastChunk, zcs->doneJobID, zcs->doneJobID & zcs->jobIDMask); - POOL_add(zcs->factory, ZSTDMT_compressChunk, &zcs->jobs[jobID]); /* this call is blocking when thread worker pool is exhausted */ - zcs->nextJobID++; - return 0; + /* check if there is any data available to flush */ + return ZSTDMT_flushProduced(mtctx, output, 1 /* blockToFlush */, endFrame); } -/* ZSTDMT_flushNextJob() : - * output : will be updated with amount of data flushed . - * blockToFlush : if >0, the function will block and wait if there is no data available to flush . - * @return : amount of data remaining within internal buffer, 1 if unknown but > 0, 0 if no more, or an error code */ -static size_t ZSTDMT_flushNextJob(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, unsigned blockToFlush) +size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output) { - unsigned const wJobID = zcs->doneJobID & zcs->jobIDMask; - if (zcs->doneJobID == zcs->nextJobID) return 0; /* all flushed ! 
*/ - PTHREAD_MUTEX_LOCK(&zcs->jobCompleted_mutex); - while (zcs->jobs[wJobID].jobCompleted==0) { - DEBUGLOG(5, "waiting for jobCompleted signal from job %u", zcs->doneJobID); - if (!blockToFlush) { pthread_mutex_unlock(&zcs->jobCompleted_mutex); return 0; } /* nothing ready to be flushed => skip */ - pthread_cond_wait(&zcs->jobCompleted_cond, &zcs->jobCompleted_mutex); /* block when nothing available to flush */ - } - pthread_mutex_unlock(&zcs->jobCompleted_mutex); - /* compression job completed : output can be flushed */ - { ZSTDMT_jobDescription job = zcs->jobs[wJobID]; - if (!job.jobScanned) { - if (ZSTD_isError(job.cSize)) { - DEBUGLOG(5, "compression error detected "); - ZSTDMT_waitForAllJobsCompleted(zcs); - ZSTDMT_releaseAllJobResources(zcs); - return job.cSize; - } - ZSTDMT_releaseCCtx(zcs->cctxPool, job.cctx); - zcs->jobs[wJobID].cctx = NULL; - DEBUGLOG(5, "zcs->params.fParams.checksumFlag : %u ", zcs->params.fParams.checksumFlag); - if (zcs->params.fParams.checksumFlag) { - XXH64_update(&zcs->xxhState, (const char*)job.srcStart + job.dictSize, job.srcSize); - if (zcs->frameEnded && (zcs->doneJobID+1 == zcs->nextJobID)) { /* write checksum at end of last section */ - U32 const checksum = (U32)XXH64_digest(&zcs->xxhState); - DEBUGLOG(4, "writing checksum : %08X \n", checksum); - MEM_writeLE32((char*)job.dstBuff.start + job.cSize, checksum); - job.cSize += 4; - zcs->jobs[wJobID].cSize += 4; - } } - ZSTDMT_releaseBuffer(zcs->buffPool, job.src); - zcs->jobs[wJobID].srcStart = NULL; - zcs->jobs[wJobID].src = g_nullBuffer; - zcs->jobs[wJobID].jobScanned = 1; - } - { size_t const toWrite = MIN(job.cSize - job.dstFlushed, output->size - output->pos); - DEBUGLOG(4, "Flushing %u bytes from job %u ", (U32)toWrite, zcs->doneJobID); - memcpy((char*)output->dst + output->pos, (const char*)job.dstBuff.start + job.dstFlushed, toWrite); - output->pos += toWrite; - job.dstFlushed += toWrite; - } - if (job.dstFlushed == job.cSize) { /* output buffer fully flushed => move to 
next one */ - ZSTDMT_releaseBuffer(zcs->buffPool, job.dstBuff); - zcs->jobs[wJobID].dstBuff = g_nullBuffer; - zcs->jobs[wJobID].jobCompleted = 0; - zcs->doneJobID++; - } else { - zcs->jobs[wJobID].dstFlushed = job.dstFlushed; - } - /* return value : how many bytes left in buffer ; fake it to 1 if unknown but >0 */ - if (job.cSize > job.dstFlushed) return (job.cSize - job.dstFlushed); - if (zcs->doneJobID < zcs->nextJobID) return 1; /* still some buffer to flush */ - zcs->allJobsCompleted = zcs->frameEnded; /* frame completed and entirely flushed */ - return 0; /* everything flushed */ -} } - - -size_t ZSTDMT_compressStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) -{ - size_t const newJobThreshold = zcs->dictSize + zcs->targetSectionSize + zcs->marginSize; - if (zcs->frameEnded) return ERROR(stage_wrong); /* current frame being ended. Only flush is allowed. Restart with init */ - if (zcs->nbThreads==1) return ZSTD_compressStream(zcs->cstream, output, input); - - /* fill input buffer */ - { size_t const toLoad = MIN(input->size - input->pos, zcs->inBuffSize - zcs->inBuff.filled); - memcpy((char*)zcs->inBuff.buffer.start + zcs->inBuff.filled, input->src, toLoad); - input->pos += toLoad; - zcs->inBuff.filled += toLoad; - } - - if ( (zcs->inBuff.filled >= newJobThreshold) /* filled enough : let's compress */ - && (zcs->nextJobID <= zcs->doneJobID + zcs->jobIDMask) ) { /* avoid overwriting job round buffer */ - CHECK_F( ZSTDMT_createCompressionJob(zcs, zcs->targetSectionSize, 0) ); - } - - /* check for data to flush */ - CHECK_F( ZSTDMT_flushNextJob(zcs, output, (zcs->inBuff.filled == zcs->inBuffSize)) ); /* block if it wasn't possible to create new job due to saturation */ - - /* recommended next input size : fill current input buffer */ - return zcs->inBuffSize - zcs->inBuff.filled; /* note : could be zero when input buffer is fully filled and no more availability to create new job */ + DEBUGLOG(5, "ZSTDMT_flushStream"); + if 
(mtctx->singleBlockingThread) + return ZSTD_flushStream(mtctx->cctxPool->cctx[0], output); + return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_flush); } - -static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, unsigned endFrame) +size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output) { - size_t const srcSize = zcs->inBuff.filled - zcs->dictSize; - - if (srcSize) DEBUGLOG(4, "flushing : %u bytes left to compress", (U32)srcSize); - if ( ((srcSize > 0) || (endFrame && !zcs->frameEnded)) - && (zcs->nextJobID <= zcs->doneJobID + zcs->jobIDMask) ) { - CHECK_F( ZSTDMT_createCompressionJob(zcs, srcSize, endFrame) ); - } - - /* check if there is any data available to flush */ - DEBUGLOG(5, "zcs->doneJobID : %u ; zcs->nextJobID : %u ", zcs->doneJobID, zcs->nextJobID); - return ZSTDMT_flushNextJob(zcs, output, 1); + DEBUGLOG(4, "ZSTDMT_endStream"); + if (mtctx->singleBlockingThread) + return ZSTD_endStream(mtctx->cctxPool->cctx[0], output); + return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_end); } - - -size_t ZSTDMT_flushStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output) -{ - if (zcs->nbThreads==1) return ZSTD_flushStream(zcs->cstream, output); - return ZSTDMT_flushStream_internal(zcs, output, 0); -} - -size_t ZSTDMT_endStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output) -{ - if (zcs->nbThreads==1) return ZSTD_endStream(zcs->cstream, output); - return ZSTDMT_flushStream_internal(zcs, output, 1); -} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/compress/zstdmt_compress.h --- a/contrib/python-zstandard/zstd/compress/zstdmt_compress.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/compress/zstdmt_compress.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ #ifndef ZSTDMT_COMPRESS_H @@ -15,31 +16,41 @@ #endif -/* Note : All prototypes defined in this file shall be considered experimental. - * There is no guarantee of API continuity (yet) on any of these prototypes */ +/* Note : This is an internal API. + * Some methods are still exposed (ZSTDLIB_API), + * because it used to be the only way to invoke MT compression. + * Now, it's recommended to use ZSTD_compress_generic() instead. + * These methods will stop being exposed in a future version */ /* === Dependencies === */ -#include /* size_t */ +#include /* size_t */ #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */ -#include "zstd.h" /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */ +#include "zstd.h" /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */ -/* === Simple one-pass functions === */ - +/* === Memory management === */ typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx; -ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads); -ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* cctx); +ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers); +ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, + ZSTD_customMem cMem); +ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx); + +ZSTDLIB_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx); + -ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - int compressionLevel); +/* === Simple one-pass compression function === */ + +ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, 
size_t srcSize, + int compressionLevel); + /* === Streaming functions === */ ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel); -ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be zero == unknown */ +ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */ ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input); @@ -49,26 +60,93 @@ /* === Advanced functions and parameters === */ -#ifndef ZSTDMT_SECTION_SIZE_MIN -# define ZSTDMT_SECTION_SIZE_MIN (1U << 20) /* 1 MB - Minimum size of each compression job */ +#ifndef ZSTDMT_JOBSIZE_MIN +# define ZSTDMT_JOBSIZE_MIN (1U << 20) /* 1 MB - Minimum size of each compression job */ #endif -ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, const void* dict, size_t dictSize, /**< dict can be released after init, a local copy is preserved within zcs */ - ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be zero == unknown */ +ZSTDLIB_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_parameters params, + unsigned overlapLog); -/* ZSDTMT_parameter : +ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, /* dict can be released after init, a local copy is preserved within zcs */ + ZSTD_parameters params, + unsigned long long pledgedSrcSize); /* pledgedSrcSize is optional and can be zero == unknown */ + +ZSTDLIB_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx, + const ZSTD_CDict* cdict, 
+ ZSTD_frameParameters fparams, + unsigned long long pledgedSrcSize); /* note : zero means empty */ + +/* ZSTDMT_parameter : * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */ typedef enum { - ZSTDMT_p_sectionSize, /* size of input "section". Each section is compressed in parallel. 0 means default, which is dynamically determined within compression functions */ - ZSTDMT_p_overlapSectionLog /* Log of overlapped section; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window */ -} ZSDTMT_parameter; + ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */ + ZSTDMT_p_overlapSectionLog /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */ +} ZSTDMT_parameter; /* ZSTDMT_setMTCtxParameter() : * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter. - * The function must be called typically after ZSTD_createCCtx(). + * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__ * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions. * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ -ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSDTMT_parameter parameter, unsigned value); +ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value); + + +/*! ZSTDMT_compressStream_generic() : + * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream() + * depending on flush directive. 
+ * @return : minimum amount of data still to be flushed + * 0 if fully flushed + * or an error code + * note : needs to be init using any ZSTD_initCStream*() variant */ +ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* ======================================================== + * === Private interface, for use by ZSTD_compress.c === + * === Not exposed in libzstd. Never invoke directly === + * ======================================================== */ + +size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value); + +/* ZSTDMT_CCtxParam_setNbWorkers() + * Set nbWorkers, and clamp it. + * Also reset jobSize and overlapLog */ +size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers); + +/*! ZSTDMT_updateCParams_whileCompressing() : + * Updates only a selected set of compression parameters, to remain compatible with current frame. + * New parameters will be applied to next compression job. */ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams); + +/* ZSTDMT_getNbWorkers(): + * @return nb threads currently active in mtctx. + * mtctx must be valid */ +unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx); + +/* ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. + */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx); + + +/*! ZSTDMT_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. 
+ * @return : 0, or an error code */ +size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); #if defined (__cplusplus) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/decompress/huf_decompress.c --- a/contrib/python-zstandard/zstd/decompress/huf_decompress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/decompress/huf_decompress.c Wed Apr 18 15:32:08 2018 -0400 @@ -33,41 +33,35 @@ ****************************************************************** */ /* ************************************************************** -* Compiler specifics -****************************************************************/ -#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -/* inline is defined */ -#elif defined(_MSC_VER) || defined(__GNUC__) -# define inline __inline -#else -# define inline /* disable inline */ -#endif - -#ifdef _MSC_VER /* Visual Studio */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -#endif - - -/* ************************************************************** * Dependencies ****************************************************************/ #include /* memcpy, memset */ #include "bitstream.h" /* BIT_* */ +#include "compiler.h" #include "fse.h" /* header compression */ #define HUF_STATIC_LINKING_ONLY #include "huf.h" +#include "error_private.h" /* ************************************************************** * Error Management ****************************************************************/ +#define HUF_isError ERR_isError #define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ +#define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; } + + +/* 
************************************************************** +* Byte alignment for workSpace management +****************************************************************/ +#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) +#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) /*-***************************/ /* generic DTableDesc */ /*-***************************/ - typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) @@ -81,19 +75,27 @@ /*-***************************/ /* single-symbol decoding */ /*-***************************/ - typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2; /* single-symbol decoding */ -size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize) +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { - BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; - U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ U32 tableLog = 0; U32 nbSymbols = 0; size_t iSize; void* const dtPtr = DTable + 1; HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; + U32* rankVal; + BYTE* huffWeight; + size_t spaceUsed32 = 0; + + rankVal = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; + huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ @@ -102,16 +104,16 @@ /* Table header */ { DTableDesc dtd = HUF_getDTableDesc(DTable); - if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, huffman tree cannot fit in */ + if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType = 0; dtd.tableLog = (BYTE)tableLog; memcpy(DTable, &dtd, sizeof(dtd)); } - /* Prepare ranks */ + /* Calculate starting value for each rank */ { U32 n, nextRankStart = 0; for (n=1; n> 1; - U32 i; + U32 u; HUF_DEltX2 D; D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w); - for (i = rankVal[w]; i < rankVal[w] + length; i++) - dt[i] = D; + for (u = rankVal[w]; u < rankVal[w] + length; u++) + dt[u] = D; rankVal[w] += length; } } return iSize; } +size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX2_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} -static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog) +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */ + +FORCE_INLINE_TEMPLATE BYTE +HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ BYTE const c = dt[val].byte; @@ -144,7 +155,7 @@ #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog) -#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) @@ -152,30 +163,33 @@ if (MEM_64bits()) \ HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) -static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog) +HINT_INLINE size_t 
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog) { BYTE* const pStart = p; /* up to 4 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { HUF_DECODE_SYMBOLX2_2(p, bitDPtr); HUF_DECODE_SYMBOLX2_1(p, bitDPtr); HUF_DECODE_SYMBOLX2_2(p, bitDPtr); HUF_DECODE_SYMBOLX2_0(p, bitDPtr); } - /* closer to the end */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + /* [0-3] symbols remaining */ + if (MEM_32bits()) + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); - /* no more data to retrieve from bitstream, hence no need to reload */ + /* no more data to retrieve from bitstream, no need to reload */ while (p < pEnd) HUF_DECODE_SYMBOLX2_0(p, bitDPtr); return pEnd-pStart; } -static size_t HUF_decompress1X2_usingDTable_internal( +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X2_usingDTable_internal_body( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) @@ -188,47 +202,17 @@ DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; - { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); - if (HUF_isError(errorCode)) return errorCode; } + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog); - /* check */ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); return dstSize; } -size_t HUF_decompress1X2_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); -} - -size_t 
HUF_decompress1X2_DCtx (HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - const BYTE* ip = (const BYTE*) cSrc; - - size_t const hSize = HUF_readDTableX2 (DCtx, cSrc, cSrcSize); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - - return HUF_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx); -} - -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); - return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); -} - - -static size_t HUF_decompress4X2_usingDTable_internal( +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) @@ -263,23 +247,19 @@ BYTE* op2 = opStart2; BYTE* op3 = opStart3; BYTE* op4 = opStart4; - U32 endSignal; + U32 endSignal = BIT_DStream_unfinished; DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ - { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1); - if (HUF_isError(errorCode)) return errorCode; } - { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2); - if (HUF_isError(errorCode)) return errorCode; } - { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3); - if (HUF_isError(errorCode)) return errorCode; } - { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4); - if (HUF_isError(errorCode)) return errorCode; } + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); - /* 16-32 symbols per loop (4-8 symbols per stream) */ + /* up to 16 symbols per loop (4 symbols per 
stream) in 64-bit mode */ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) { + while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) { HUF_DECODE_SYMBOLX2_2(op1, &bitD1); HUF_DECODE_SYMBOLX2_2(op2, &bitD2); HUF_DECODE_SYMBOLX2_2(op3, &bitD3); @@ -296,10 +276,15 @@ HUF_DECODE_SYMBOLX2_0(op2, &bitD2); HUF_DECODE_SYMBOLX2_0(op3, &bitD3); HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + BIT_reloadDStream(&bitD1); + BIT_reloadDStream(&bitD2); + BIT_reloadDStream(&bitD3); + BIT_reloadDStream(&bitD4); } /* check corruption */ + /* note : should not be necessary : op# advance in lock step, and we control op4. + * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ if (op1 > opStart2) return ERROR(corruption_detected); if (op2 > opStart3) return ERROR(corruption_detected); if (op3 > opStart4) return ERROR(corruption_detected); @@ -312,8 +297,8 @@ HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); /* check */ - endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); - if (!endSignal) return ERROR(corruption_detected); + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } /* decoded size */ return dstSize; @@ -321,6 +306,279 @@ } +FORCE_INLINE_TEMPLATE U32 +HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +FORCE_INLINE_TEMPLATE U32 
+HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 1); + if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); + else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) + /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); + } } + return 1; +} + +#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +HINT_INLINE size_t +HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, + const HUF_DEltX4* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_1(p, bitDPtr); + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X4_usingDTable_internal_body( + void* dst, size_t 
dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X4_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; + const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal; + 
DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) { + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_1(op1, &bitD1); + HUF_DECODE_SYMBOLX4_1(op2, &bitD2); + HUF_DECODE_SYMBOLX4_1(op3, &bitD3); + HUF_DECODE_SYMBOLX4_1(op4, &bitD4); + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_0(op1, &bitD1); + HUF_DECODE_SYMBOLX4_0(op2, &bitD2); + HUF_DECODE_SYMBOLX4_0(op3, &bitD3); + HUF_DECODE_SYMBOLX4_0(op4, &bitD4); + + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & 
BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + + +typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); +#if DYNAMIC_BMI2 + +#define X(fn) \ + \ + static size_t fn##_default( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + if (bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#else + +#define X(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + (void)bmi2; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#endif + +X(HUF_decompress1X2_usingDTable_internal) +X(HUF_decompress4X2_usingDTable_internal) +X(HUF_decompress1X4_usingDTable_internal) +X(HUF_decompress4X4_usingDTable_internal) + +#undef X + + +size_t HUF_decompress1X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, 
size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); +} + size_t HUF_decompress4X2_usingDTable( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, @@ -328,22 +586,38 @@ { DTableDesc dtd = HUF_getDTableDesc(DTable); if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return 
HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); } size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { - const BYTE* ip = (const BYTE*) cSrc; - - size_t const hSize = HUF_readDTableX2 (dctx, cSrc, cSrcSize); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - - return HUF_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx); + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); } - size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); @@ -354,8 +628,6 @@ /* *************************/ /* double-symbols decoding */ /* *************************/ -typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */ - typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; /* HUF_fillDTableX4Level2() : @@ -400,7 +672,8 @@ } } } -typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1]; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; +typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog, const sortedSymbol_t* sortedList, const U32 sortedListSize, @@ -444,22 +717,42 @@ } } -size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize) +size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src, + size_t srcSize, void* workSpace, + size_t wkspSize) { - BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; - sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; - U32 rankStats[HUF_TABLELOG_MAX + 1] = { 0 }; - U32 rankStart0[HUF_TABLELOG_MAX + 2] = { 0 }; - U32* const rankStart = rankStart0+1; - rankVal_t rankVal; U32 tableLog, maxW, sizeOfSort, nbSymbols; DTableDesc dtd 
= HUF_getDTableDesc(DTable); U32 const maxTableLog = dtd.maxTableLog; size_t iSize; void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr; + U32 *rankStart; - HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compilation fails here, assertion is false */ + rankValCol_t* rankVal; + U32* rankStats; + U32* rankStart0; + sortedSymbol_t* sortedSymbol; + BYTE* weightList; + size_t spaceUsed32 = 0; + + rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; + rankStats = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + rankStart0 = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 2; + sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t); + spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; + weightList = (BYTE *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + + rankStart = rankStart0 + 1; + memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + + HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ @@ -524,93 +817,11 @@ return iSize; } - -static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) -{ - size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - memcpy(op, dt+val, 2); - BIT_skipBits(DStream, dt[val].nbBits); - return dt[val].length; -} - -static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) -{ - size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - memcpy(op, dt+val, 1); - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); - else { - if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { - BIT_skipBits(DStream, dt[val].nbBits); - if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) - DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ - } } - return 1; -} - - -#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \ - ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) - -#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \ - if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) - -#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \ - if (MEM_64bits()) \ - ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) - -static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog) +size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize) { - BYTE* const pStart = p; - - /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX4_2(p, bitDPtr); - HUF_DECODE_SYMBOLX4_1(p, bitDPtr); - HUF_DECODE_SYMBOLX4_2(p, bitDPtr); - HUF_DECODE_SYMBOLX4_0(p, bitDPtr); - } - - /* closer to end : up to 2 symbols at a time */ - while 
((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) - HUF_DECODE_SYMBOLX4_0(p, bitDPtr); - - while (p <= pEnd-2) - HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ - - if (p < pEnd) - p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog); - - return p-pStart; -} - - -static size_t HUF_decompress1X4_usingDTable_internal( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - BIT_DStream_t bitD; - - /* Init */ - { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); - if (HUF_isError(errorCode)) return errorCode; - } - - /* decode */ - { BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; - const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ - const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr; - DTableDesc const dtd = HUF_getDTableDesc(DTable); - HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog); - } - - /* check */ - if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); - - /* decoded size */ - return dstSize; + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX4_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); } size_t HUF_decompress1X4_usingDTable( @@ -620,19 +831,31 @@ { DTableDesc dtd = HUF_getDTableDesc(DTable); if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } -size_t HUF_decompress1X4_DCtx (HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX4 (DCtx, cSrc, cSrcSize); + size_t const hSize = 
HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, + workSpace, wkspSize); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx); + return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress1X4_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X4_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); } size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) @@ -641,99 +864,6 @@ return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); } -static size_t HUF_decompress4X4_usingDTable_internal( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ - - { const BYTE* const istart = (const BYTE*) cSrc; - BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; - const void* const dtPtr = DTable+1; - const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr; - - /* Init */ - BIT_DStream_t bitD1; - BIT_DStream_t bitD2; - BIT_DStream_t bitD3; - BIT_DStream_t bitD4; - size_t const length1 = MEM_readLE16(istart); - size_t const length2 = MEM_readLE16(istart+2); - size_t const length3 = MEM_readLE16(istart+4); - size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); - const BYTE* const istart1 = istart + 6; /* jumpTable */ - const BYTE* const istart2 = istart1 + length1; - const BYTE* const istart3 = istart2 + length2; - const BYTE* const istart4 = istart3 + length3; - size_t const segmentSize = (dstSize+3) / 4; - BYTE* const opStart2 = ostart + segmentSize; - BYTE* const opStart3 = opStart2 + segmentSize; - BYTE* 
const opStart4 = opStart3 + segmentSize; - BYTE* op1 = ostart; - BYTE* op2 = opStart2; - BYTE* op3 = opStart3; - BYTE* op4 = opStart4; - U32 endSignal; - DTableDesc const dtd = HUF_getDTableDesc(DTable); - U32 const dtLog = dtd.tableLog; - - if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ - { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1); - if (HUF_isError(errorCode)) return errorCode; } - { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2); - if (HUF_isError(errorCode)) return errorCode; } - { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3); - if (HUF_isError(errorCode)) return errorCode; } - { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4); - if (HUF_isError(errorCode)) return errorCode; } - - /* 16-32 symbols per loop (4-8 symbols per stream) */ - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) { - HUF_DECODE_SYMBOLX4_2(op1, &bitD1); - HUF_DECODE_SYMBOLX4_2(op2, &bitD2); - HUF_DECODE_SYMBOLX4_2(op3, &bitD3); - HUF_DECODE_SYMBOLX4_2(op4, &bitD4); - HUF_DECODE_SYMBOLX4_1(op1, &bitD1); - HUF_DECODE_SYMBOLX4_1(op2, &bitD2); - HUF_DECODE_SYMBOLX4_1(op3, &bitD3); - HUF_DECODE_SYMBOLX4_1(op4, &bitD4); - HUF_DECODE_SYMBOLX4_2(op1, &bitD1); - HUF_DECODE_SYMBOLX4_2(op2, &bitD2); - HUF_DECODE_SYMBOLX4_2(op3, &bitD3); - HUF_DECODE_SYMBOLX4_2(op4, &bitD4); - HUF_DECODE_SYMBOLX4_0(op1, &bitD1); - HUF_DECODE_SYMBOLX4_0(op2, &bitD2); - HUF_DECODE_SYMBOLX4_0(op3, &bitD3); - HUF_DECODE_SYMBOLX4_0(op4, &bitD4); - - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - } - - /* check corruption */ - if (op1 > opStart2) return ERROR(corruption_detected); - if (op2 > opStart3) return ERROR(corruption_detected); - if (op3 > opStart4) return 
ERROR(corruption_detected); - /* note : op4 already verified within main loop */ - - /* finish bitStreams one by one */ - HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog); - HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog); - HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog); - HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog); - - /* check */ - { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); - if (!endCheck) return ERROR(corruption_detected); } - - /* decoded size */ - return dstSize; - } -} - - size_t HUF_decompress4X4_usingDTable( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, @@ -741,20 +871,38 @@ { DTableDesc dtd = HUF_getDTableDesc(DTable); if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } - -size_t HUF_decompress4X4_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) { const BYTE* ip = (const BYTE*) cSrc; - size_t hSize = HUF_readDTableX4 (dctx, cSrc, cSrcSize); + size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, + workSpace, wkspSize); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx); + return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, 
cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + + +size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); } size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) @@ -773,8 +921,8 @@ const HUF_DTable* DTable) { DTableDesc const dtd = HUF_getDTableDesc(DTable); - return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) : - HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable); + return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, @@ -782,8 +930,8 @@ const HUF_DTable* DTable) { DTableDesc const dtd = HUF_getDTableDesc(DTable); - return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) : - HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable); + return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } @@ -810,21 +958,22 @@ }; /** HUF_selectDecoder() : -* Tells which decoder is likely to decode faster, -* based on a set of pre-determined metrics. -* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . -* Assumption : 0 < cSrcSize < dstSize <= 128 KB */ + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . 
+ * Assumption : 0 < dstSize <= 128 KB */ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) { + assert(dstSize > 0); + assert(dstSize <= 128 KB); /* decoder timing evaluation */ - U32 const Q = (U32)(cSrcSize * 16 / dstSize); /* Q < 16 since dstSize > cSrcSize */ - U32 const D256 = (U32)(dstSize >> 8); - U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); - U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */ - - return DTime1 < DTime0; -} + { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + return DTime1 < DTime0; +} } typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); @@ -858,19 +1007,32 @@ } } -size_t HUF_decompress4X_hufOnly (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, + size_t dstSize, const void* cSrc, + size_t cSrcSize, void* workSpace, + size_t wkspSize) { /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); - if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == 0) return ERROR(corruption_detected); { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); 
- return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : - HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; + return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize): + HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); } } -size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) { /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); @@ -879,7 +1041,56 @@ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); - return algoNb ? HUF_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : - HUF_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; + return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); } } + +size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + + +size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); + return dtd.tableType ? 
HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +} + +size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); + return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +} + +size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? 
HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : + HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + } +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/decompress/zstd_decompress.c --- a/contrib/python-zstandard/zstd/decompress/zstd_decompress.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/decompress/zstd_decompress.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ @@ -13,8 +14,9 @@ *****************************************************************/ /*! * HEAPMODE : - * Select how default decompression function ZSTD_decompress() will allocate memory, - * in memory stack (0), or in memory heap (1, requires malloc()) + * Select how default decompression function ZSTD_decompress() allocates its context, + * on stack (0), or into heap (1, default; requires malloc()). + * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected. */ #ifndef ZSTD_HEAPMODE # define ZSTD_HEAPMODE 1 @@ -22,19 +24,20 @@ /*! * LEGACY_SUPPORT : -* if set to 1, ZSTD_decompress() can decode older formats (v0.1+) +* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+) */ #ifndef ZSTD_LEGACY_SUPPORT # define ZSTD_LEGACY_SUPPORT 0 #endif /*! 
-* MAXWINDOWSIZE_DEFAULT : -* maximum window size accepted by DStream, by default. -* Frames requiring more memory will be rejected. -*/ + * MAXWINDOWSIZE_DEFAULT : + * maximum window size accepted by DStream __by default__. + * Frames requiring more memory will be rejected. + * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize(). + */ #ifndef ZSTD_MAXWINDOWSIZE_DEFAULT -# define ZSTD_MAXWINDOWSIZE_DEFAULT ((1 << ZSTD_WINDOWLOG_MAX) + 1) /* defined within zstd.h */ +# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1) #endif @@ -42,9 +45,8 @@ * Dependencies *********************************************************/ #include /* memcpy, memmove, memset */ +#include "cpu.h" #include "mem.h" /* low level memory routines */ -#define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ -#include "xxhash.h" /* XXH64_* */ #define FSE_STATIC_LINKING_ONLY #include "fse.h" #define HUF_STATIC_LINKING_ONLY @@ -56,17 +58,8 @@ #endif -#if defined(_MSC_VER) -# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define ZSTD_PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0) -#elif defined(__GNUC__) -# define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) -#else -# define ZSTD_PREFETCH(ptr) /* disabled */ -#endif - /*-************************************* -* Macros +* Errors ***************************************/ #define ZSTD_isError ERR_isError /* for inlining */ #define FSE_isError ERR_isError @@ -87,119 +80,183 @@ ZSTDds_decompressLastBlock, ZSTDds_checkChecksum, ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage; +typedef enum { zdss_init=0, zdss_loadHeader, + zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; + + +typedef struct { + U32 fastMode; + U32 tableLog; +} ZSTD_seqSymbol_header; + +typedef struct { + U16 nextState; + BYTE nbAdditionalBits; + BYTE nbBits; + U32 baseValue; +} ZSTD_seqSymbol; + +#define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) + +typedef struct { + 
ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; + HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ + U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_entropyDTables_t; + struct ZSTD_DCtx_s { - const FSE_DTable* LLTptr; - const FSE_DTable* MLTptr; - const FSE_DTable* OFTptr; + const ZSTD_seqSymbol* LLTptr; + const ZSTD_seqSymbol* MLTptr; + const ZSTD_seqSymbol* OFTptr; const HUF_DTable* HUFptr; - FSE_DTable LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)]; - FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)]; - FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)]; - HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ - const void* previousDstEnd; - const void* base; - const void* vBase; - const void* dictEnd; + ZSTD_entropyDTables_t entropy; + const void* previousDstEnd; /* detect continuity */ + const void* base; /* start of current segment */ + const void* vBase; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ size_t expected; - U32 rep[ZSTD_REP_NUM]; - ZSTD_frameParams fParams; - blockType_e bType; /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */ + ZSTD_frameHeader fParams; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ ZSTD_dStage stage; U32 litEntropy; U32 fseEntropy; XXH64_state_t xxhState; size_t headerSize; U32 dictID; + ZSTD_format_e format; const BYTE* litPtr; ZSTD_customMem customMem; size_t litSize; size_t rleSize; - BYTE litBuffer[ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH]; + size_t staticSize; + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. 
CPU support is determined dynamically once per context lifetime. */ + + /* streaming */ + ZSTD_DDict* ddictLocal; + const ZSTD_DDict* ddict; + ZSTD_dStreamStage streamStage; + char* inBuff; + size_t inBuffSize; + size_t inPos; + size_t maxWindowSize; + char* outBuff; + size_t outBuffSize; + size_t outStart; + size_t outEnd; + size_t lhSize; + void* legacyContext; + U32 previousLegacyVersion; + U32 legacyVersion; + U32 hostageByte; + + /* workspace */ + BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ -size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) { return (dctx==NULL) ? 0 : sizeof(ZSTD_DCtx); } +size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support sizeof NULL */ + return sizeof(*dctx) + + ZSTD_sizeof_DDict(dctx->ddictLocal) + + dctx->inBuffSize + dctx->outBuffSize; +} size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); } -size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + +static size_t ZSTD_startingInputLength(ZSTD_format_e format) +{ + size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ? 
+ ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize : + ZSTD_frameHeaderSize_prefix; + ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE); + /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */ + assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) ); + return startingInputLength; +} + +static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) { - dctx->expected = ZSTD_frameHeaderSize_prefix; - dctx->stage = ZSTDds_getFrameHeaderSize; - dctx->previousDstEnd = NULL; - dctx->base = NULL; - dctx->vBase = NULL; - dctx->dictEnd = NULL; - dctx->hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ - dctx->litEntropy = dctx->fseEntropy = 0; - dctx->dictID = 0; - MEM_STATIC_ASSERT(sizeof(dctx->rep) == sizeof(repStartValue)); - memcpy(dctx->rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ - dctx->LLTptr = dctx->LLTable; - dctx->MLTptr = dctx->MLTable; - dctx->OFTptr = dctx->OFTable; - dctx->HUFptr = dctx->hufTable; - return 0; + dctx->format = ZSTD_f_zstd1; /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */ + dctx->staticSize = 0; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + dctx->ddict = NULL; + dctx->ddictLocal = NULL; + dctx->inBuff = NULL; + dctx->inBuffSize = 0; + dctx->outBuffSize = 0; + dctx->streamStage = zdss_init; + dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +} + +ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */ + + ZSTD_initDCtx_internal(dctx); + dctx->staticSize = workspaceSize; + dctx->inBuff = (char*)(dctx+1); + return dctx; } ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) { - ZSTD_DCtx* dctx; - - if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; - if 
(!customMem.customAlloc || !customMem.customFree) return NULL; - - dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(ZSTD_DCtx), customMem); - if (!dctx) return NULL; - memcpy(&dctx->customMem, &customMem, sizeof(customMem)); - ZSTD_decompressBegin(dctx); - return dctx; + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem); + if (!dctx) return NULL; + dctx->customMem = customMem; + dctx->legacyContext = NULL; + dctx->previousLegacyVersion = 0; + ZSTD_initDCtx_internal(dctx); + return dctx; + } } ZSTD_DCtx* ZSTD_createDCtx(void) { - return ZSTD_createDCtx_advanced(defaultCustomMem); + DEBUGLOG(3, "ZSTD_createDCtx"); + return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); } size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) { if (dctx==NULL) return 0; /* support free on NULL */ - ZSTD_free(dctx, dctx->customMem); - return 0; /* reserved as a potential error code in the future */ + if (dctx->staticSize) return ERROR(memory_allocation); /* not compatible with static DCtx */ + { ZSTD_customMem const cMem = dctx->customMem; + ZSTD_freeDDict(dctx->ddictLocal); + dctx->ddictLocal = NULL; + ZSTD_free(dctx->inBuff, cMem); + dctx->inBuff = NULL; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (dctx->legacyContext) + ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); +#endif + ZSTD_free(dctx, cMem); + return 0; + } } +/* no longer useful */ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) { - size_t const workSpaceSize = (ZSTD_BLOCKSIZE_ABSOLUTEMAX+WILDCOPY_OVERLENGTH) + ZSTD_frameHeaderSize_max; - memcpy(dstDCtx, srcDCtx, sizeof(ZSTD_DCtx) - workSpaceSize); /* no need to copy workspace */ -} - -static void ZSTD_refDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) -{ - ZSTD_decompressBegin(dstDCtx); /* init */ - if (srcDCtx) { /* support refDCtx on NULL */ - dstDCtx->dictEnd = srcDCtx->dictEnd; - dstDCtx->vBase = srcDCtx->vBase; - dstDCtx->base = 
srcDCtx->base; - dstDCtx->previousDstEnd = srcDCtx->previousDstEnd; - dstDCtx->dictID = srcDCtx->dictID; - dstDCtx->litEntropy = srcDCtx->litEntropy; - dstDCtx->fseEntropy = srcDCtx->fseEntropy; - dstDCtx->LLTptr = srcDCtx->LLTable; - dstDCtx->MLTptr = srcDCtx->MLTable; - dstDCtx->OFTptr = srcDCtx->OFTable; - dstDCtx->HUFptr = srcDCtx->hufTable; - dstDCtx->rep[0] = srcDCtx->rep[0]; - dstDCtx->rep[1] = srcDCtx->rep[1]; - dstDCtx->rep[2] = srcDCtx->rep[2]; - } + size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx); + memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ } /*-************************************************************* -* Decompression section -***************************************************************/ + * Frame header decoding + ***************************************************************/ /*! ZSTD_isFrame() : * Tells if the content of `buffer` starts with a valid Frame Identifier. @@ -208,7 +265,7 @@ * Note 3 : Skippable Frame Identifiers are considered valid. */ unsigned ZSTD_isFrame(const void* buffer, size_t size) { - if (size < 4) return 0; + if (size < ZSTD_frameIdSize) return 0; { U32 const magic = MEM_readLE32(buffer); if (magic == ZSTD_MAGICNUMBER) return 1; if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1; @@ -219,70 +276,91 @@ return 0; } - -/** ZSTD_frameHeaderSize() : -* srcSize must be >= ZSTD_frameHeaderSize_prefix. -* @return : size of the Frame Header */ -static size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +/** ZSTD_frameHeaderSize_internal() : + * srcSize must be large enough to reach header size fields. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. 
+ * @return : size of the Frame Header + * or an error code, which can be tested with ZSTD_isError() */ +static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) { - if (srcSize < ZSTD_frameHeaderSize_prefix) return ERROR(srcSize_wrong); - { BYTE const fhd = ((const BYTE*)src)[4]; + size_t const minInputSize = ZSTD_startingInputLength(format); + if (srcSize < minInputSize) return ERROR(srcSize_wrong); + + { BYTE const fhd = ((const BYTE*)src)[minInputSize-1]; U32 const dictID= fhd & 3; U32 const singleSegment = (fhd >> 5) & 1; U32 const fcsId = fhd >> 6; - return ZSTD_frameHeaderSize_prefix + !singleSegment + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] - + (singleSegment && !fcsId); + return minInputSize + !singleSegment + + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + + (singleSegment && !fcsId); } } - -/** ZSTD_getFrameParams() : -* decode Frame Header, or require larger `srcSize`. -* @return : 0, `fparamsPtr` is correctly filled, -* >0, `srcSize` is too small, result is expected `srcSize`, -* or an error code, which can be tested using ZSTD_isError() */ -size_t ZSTD_getFrameParams(ZSTD_frameParams* fparamsPtr, const void* src, size_t srcSize) +/** ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_frameHeaderSize_prefix. + * @return : size of the Frame Header */ +size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +{ + return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameHeader_internal() : + * decode Frame Header, or require larger `srcSize`. 
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_getFrameHeader_internal(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) { const BYTE* ip = (const BYTE*)src; - - if (srcSize < ZSTD_frameHeaderSize_prefix) return ZSTD_frameHeaderSize_prefix; - if (MEM_readLE32(src) != ZSTD_MAGICNUMBER) { + size_t const minInputSize = ZSTD_startingInputLength(format); + + if (srcSize < minInputSize) return minInputSize; + + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { - if (srcSize < ZSTD_skippableHeaderSize) return ZSTD_skippableHeaderSize; /* magic number + skippable frame length */ - memset(fparamsPtr, 0, sizeof(*fparamsPtr)); - fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4); - fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */ + /* skippable frame */ + if (srcSize < ZSTD_skippableHeaderSize) + return ZSTD_skippableHeaderSize; /* magic number + frame length */ + memset(zfhPtr, 0, sizeof(*zfhPtr)); + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize); + zfhPtr->frameType = ZSTD_skippableFrame; return 0; } return ERROR(prefix_unknown); } /* ensure there is enough `srcSize` to fully read/decode frame header */ - { size_t const fhsize = ZSTD_frameHeaderSize(src, srcSize); - if (srcSize < fhsize) return fhsize; } - - { BYTE const fhdByte = ip[4]; - size_t pos = 5; + { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format); + if (srcSize < fhsize) return fhsize; + zfhPtr->headerSize = (U32)fhsize; + } + + { BYTE const fhdByte = ip[minInputSize-1]; + size_t pos = minInputSize; U32 const dictIDSizeCode = fhdByte&3; U32 const 
checksumFlag = (fhdByte>>2)&1; U32 const singleSegment = (fhdByte>>5)&1; U32 const fcsID = fhdByte>>6; - U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; - U32 windowSize = 0; + U64 windowSize = 0; U32 dictID = 0; - U64 frameContentSize = 0; - if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported); /* reserved bits, which must be zero */ + U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN; + if ((fhdByte & 0x08) != 0) + return ERROR(frameParameter_unsupported); /* reserved bits, must be zero */ + if (!singleSegment) { BYTE const wlByte = ip[pos++]; U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; - if (windowLog > ZSTD_WINDOWLOG_MAX) return ERROR(frameParameter_windowTooLarge); /* avoids issue with 1 << windowLog */ - windowSize = (1U << windowLog); + if (windowLog > ZSTD_WINDOWLOG_MAX) + return ERROR(frameParameter_windowTooLarge); + windowSize = (1ULL << windowLog); windowSize += (windowSize >> 3) * (wlByte&7); } - switch(dictIDSizeCode) { - default: /* impossible */ + default: assert(0); /* impossible */ case 0 : break; case 1 : dictID = ip[pos]; pos++; break; case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -290,40 +368,121 @@ } switch(fcsID) { - default: /* impossible */ + default: assert(0); /* impossible */ case 0 : if (singleSegment) frameContentSize = ip[pos]; break; case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; case 2 : frameContentSize = MEM_readLE32(ip+pos); break; case 3 : frameContentSize = MEM_readLE64(ip+pos); break; } - if (!windowSize) windowSize = (U32)frameContentSize; - if (windowSize > windowSizeMax) return ERROR(frameParameter_windowTooLarge); - fparamsPtr->frameContentSize = frameContentSize; - fparamsPtr->windowSize = windowSize; - fparamsPtr->dictID = dictID; - fparamsPtr->checksumFlag = checksumFlag; + if (singleSegment) windowSize = frameContentSize; + + zfhPtr->frameType = ZSTD_frame; + zfhPtr->frameContentSize = frameContentSize; + zfhPtr->windowSize = windowSize; + 
zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + zfhPtr->dictID = dictID; + zfhPtr->checksumFlag = checksumFlag; } return 0; } +/** ZSTD_getFrameHeader() : + * decode Frame Header, or require larger `srcSize`. + * note : this function does not consume input, it only reads it. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) +{ + return ZSTD_getFrameHeader_internal(zfhPtr, src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameContentSize() : + * compatible with legacy mode + * @return : decompressed size of the single frame pointed to be `src` if known, otherwise + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize); + return ret == 0 ? 
ZSTD_CONTENTSIZE_UNKNOWN : ret; + } +#endif + { ZSTD_frameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { + return 0; + } else { + return zfh.frameContentSize; + } } +} + +/** ZSTD_findDecompressedSize() : + * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long totalDstSize = 0; + + while (srcSize >= ZSTD_frameHeaderSize_prefix) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t skippableSize; + if (srcSize < ZSTD_skippableHeaderSize) + return ERROR(srcSize_wrong); + skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize) + + ZSTD_skippableHeaderSize; + if (srcSize < skippableSize) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; + + /* check for overflow */ + if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; + totalDstSize += ret; + } + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + if (srcSize) return ZSTD_CONTENTSIZE_ERROR; + + return totalDstSize; +} /** ZSTD_getDecompressedSize() : * compatible with legacy mode * @return : decompressed size if known, 0 otherwise note : 0 can mean any of the following : - - decompressed size is not present within frame header + - frame content is empty + - 
decompressed size field is not present in frame header - frame header unknown / not supported - frame header not complete (`srcSize` too small) */ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) { -#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1) - if (ZSTD_isLegacy(src, srcSize)) return ZSTD_getDecompressedSize_legacy(src, srcSize); -#endif - { ZSTD_frameParams fparams; - size_t const frResult = ZSTD_getFrameParams(&fparams, src, srcSize); - if (frResult!=0) return 0; - return fparams.frameContentSize; - } + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN); + return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret; } @@ -332,25 +491,24 @@ * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) { - size_t const result = ZSTD_getFrameParams(&(dctx->fParams), src, headerSize); - if (ZSTD_isError(result)) return result; /* invalid header */ - if (result>0) return ERROR(srcSize_wrong); /* headerSize too small */ - if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) return ERROR(dictionary_wrong); + size_t const result = ZSTD_getFrameHeader_internal(&(dctx->fParams), src, headerSize, dctx->format); + if (ZSTD_isError(result)) return result; /* invalid header */ + if (result>0) return ERROR(srcSize_wrong); /* headerSize too small */ + if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) + return ERROR(dictionary_wrong); if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0); return 0; } -typedef struct -{ - blockType_e blockType; - U32 lastBlock; - U32 origSize; -} blockProperties_t; +/*-************************************************************* + * Block decoding + ***************************************************************/ /*! 
ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ -size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) +size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr) { if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong); { U32 const cBlockHeader = MEM_readLE24(src); @@ -365,7 +523,8 @@ } -static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall); memcpy(dst, src, srcSize); @@ -373,7 +532,9 @@ } -static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, size_t regenSize) +static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + size_t regenSize) { if (srcSize != 1) return ERROR(srcSize_wrong); if (regenSize > dstCapacity) return ERROR(dstSize_tooSmall); @@ -382,7 +543,8 @@ } /*! ZSTD_decodeLiteralsBlock() : - @return : nb of bytes read from src (< srcSize ) */ + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ { @@ -424,22 +586,24 @@ litCSize = (lhc >> 22) + (istart[4] << 10); break; } - if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected); + if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected); if (litCSize + lhSize > srcSize) return ERROR(corruption_detected); if (HUF_isError((litEncType==set_repeat) ? ( singleStream ? 
- HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr) : - HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr) ) : + HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) : + HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) : ( singleStream ? - HUF_decompress1X2_DCtx(dctx->hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) : - HUF_decompress4X_hufOnly (dctx->hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize)) )) + HUF_decompress1X2_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) : + HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2)))) return ERROR(corruption_detected); dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; dctx->litEntropy = 1; - if (litEncType==set_compressed) dctx->HUFptr = dctx->hufTable; + if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return litCSize + lhSize; } @@ -496,7 +660,7 @@ if (srcSize<4) return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */ break; } - if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected); + if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected); memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; @@ -508,227 +672,275 @@ } } - -typedef union { - FSE_decode_t realData; - U32 alignedBy4; -} FSE_decode_t4; - -static const FSE_decode_t4 LL_defaultDTable[(1<tableLog = 0; + DTableH->fastMode = 0; + + cell->nbBits = 0; + 
cell->nextState = 0; + assert(nbAddBits < 255); + cell->nbAdditionalBits = (BYTE)nbAddBits; + cell->baseValue = baseValue; +} + + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) */ +static void +ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog) +{ + ZSTD_seqSymbol* const tableDecode = dt+1; + U16 symbolNext[MaxSeq+1]; + + U32 const maxSV1 = maxSymbolValue + 1; + U32 const tableSize = 1 << tableLog; + U32 highThreshold = tableSize-1; + + /* Sanity Checks */ + assert(maxSymbolValue <= MaxSeq); + assert(tableLog <= MaxFSELog); + + /* Init, lay down lowprob symbols */ + { ZSTD_seqSymbol_header DTableH; + DTableH.tableLog = tableLog; + DTableH.fastMode = 1; + { S16 const largeLimit= (S16)(1 << (tableLog-1)); + U32 s; + for (s=0; s= largeLimit) DTableH.fastMode=0; + symbolNext[s] = normalizedCounter[s]; + } } } + memcpy(dt, &DTableH, sizeof(DTableH)); + } + + /* Spread symbols */ + { U32 const tableMask = tableSize-1; + U32 const step = FSE_TABLESTEP(tableSize); + U32 s, position = 0; + for (s=0; s highThreshold) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } + + /* Build Decoding table */ + { U32 u; + for (u=0; u max) return ERROR(corruption_detected); - FSE_buildDTable_rle(DTableSpace, *(const BYTE*)src); + { U32 const symbol = *(const BYTE*)src; + U32 const baseline = baseValue[symbol]; + U32 const nbBits = nbAdditionalBits[symbol]; + ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); + } *DTablePtr = DTableSpace; return 1; case set_basic : - *DTablePtr = (const FSE_DTable*)tmpPtr; + *DTablePtr = defaultTable; return 0; case set_repeat: if (!flagRepeatTable) return ERROR(corruption_detected); return 0; - default : /* impossible */ case set_compressed : { U32 tableLog; S16 
norm[MaxSeq+1]; size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); if (FSE_isError(headerSize)) return ERROR(corruption_detected); if (tableLog > maxLog) return ERROR(corruption_detected); - FSE_buildDTable(DTableSpace, norm, max, tableLog); + ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog); *DTablePtr = DTableSpace; return headerSize; - } } + } + default : /* impossible */ + assert(0); + return ERROR(GENERIC); + } } +static const U32 LL_base[MaxLL+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 18, 20, 22, 24, 28, 32, 40, + 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, + 0x2000, 0x4000, 0x8000, 0x10000 }; + +static const U32 OF_base[MaxOff+1] = { + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; + +static const U32 OF_bits[MaxOff+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 }; + +static const U32 ML_base[MaxML+1] = { + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 37, 39, 41, 43, 47, 51, 59, + 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, + 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; + + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) { const BYTE* const istart = (const BYTE* const)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; + DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); /* check */ if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong); @@ -756,21 +968,29 @@ ip++; /* Build DTables */ - { size_t const llhSize = ZSTD_buildSeqTable(dctx->LLTable, &dctx->LLTptr, + { size_t const llhSize = 
ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, - ip, iend-ip, LL_defaultDTable, dctx->fseEntropy); + ip, iend-ip, + LL_base, LL_bits, + LL_defaultDTable, dctx->fseEntropy); if (ZSTD_isError(llhSize)) return ERROR(corruption_detected); ip += llhSize; } - { size_t const ofhSize = ZSTD_buildSeqTable(dctx->OFTable, &dctx->OFTptr, + + { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, - ip, iend-ip, OF_defaultDTable, dctx->fseEntropy); + ip, iend-ip, + OF_base, OF_bits, + OF_defaultDTable, dctx->fseEntropy); if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected); ip += ofhSize; } - { size_t const mlhSize = ZSTD_buildSeqTable(dctx->MLTable, &dctx->MLTptr, + + { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, - ip, iend-ip, ML_defaultDTable, dctx->fseEntropy); + ip, iend-ip, + ML_base, ML_bits, + ML_defaultDTable, dctx->fseEntropy); if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected); ip += mlhSize; } @@ -788,14 +1008,19 @@ } seq_t; typedef struct { + size_t state; + const ZSTD_seqSymbol* table; +} ZSTD_fseState; + +typedef struct { BIT_DStream_t DStream; - FSE_DState_t stateLL; - FSE_DState_t stateOffb; - FSE_DState_t stateML; + ZSTD_fseState stateLL; + ZSTD_fseState stateOffb; + ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* base; + const BYTE* prefixStart; + const BYTE* dictEnd; size_t pos; - iPtrDiff gotoDict; } seqState_t; @@ -846,88 +1071,11 @@ } - - -static seq_t ZSTD_decodeSequence(seqState_t* seqState) -{ - seq_t seq; - - U32 const llCode = FSE_peekSymbol(&seqState->stateLL); - U32 const mlCode = FSE_peekSymbol(&seqState->stateML); - U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ - - U32 const llBits = LL_bits[llCode]; - U32 const mlBits = ML_bits[mlCode]; - U32 const ofBits = ofCode; - U32 const totalBits = llBits+mlBits+ofBits; 
- - static const U32 LL_base[MaxLL+1] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, - 0x2000, 0x4000, 0x8000, 0x10000 }; - - static const U32 ML_base[MaxML+1] = { - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, - 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, - 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; - - static const U32 OF_base[MaxOff+1] = { - 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, - 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, - 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, - 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; - - /* sequence */ - { size_t offset; - if (!ofCode) - offset = 0; - else { - offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - - if (ofCode <= 1) { - offset += (llCode==0); - if (offset) { - size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } else { - offset = seqState->prevOffset[0]; - } - } else { - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } - seq.offset = offset; - } - - seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ - if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); - - seq.litLength = LL_base[llCode] + ((llCode>15) ? 
BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ - if (MEM_32bits() || - (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); - - /* ANS state update */ - FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - - return seq; -} - - -FORCE_INLINE +HINT_INLINE size_t ZSTD_execSequence(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -950,9 +1098,10 @@ /* copy Match */ if (sequence.offset > (size_t)(oLitEnd - base)) { - /* offset beyond prefix */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); - match += (dictEnd-base); + /* offset beyond prefix -> go into extDict */ + if (sequence.offset > (size_t)(oLitEnd - vBase)) + return ERROR(corruption_detected); + match = dictEnd + (match - base); if (match + sequence.matchLength <= dictEnd) { memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; @@ -975,7 +1124,87 @@ if (sequence.offset < 8) { /* close range match, overlap */ static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ - static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* substracted */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ + int const sub2 = dec64table[sequence.offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + 
op[3] = match[3]; + match += dec32table[sequence.offset]; + ZSTD_copy4(op+4, match); + match -= sub2; + } else { + ZSTD_copy8(op, match); + } + op += 8; match += 8; + + if (oMatchEnd > oend-(16-MINMATCH)) { + if (op < oend_w) { + ZSTD_wildcopy(op, match, oend_w - op); + match += oend_w - op; + op = oend_w; + } + while (op < oMatchEnd) *op++ = *match++; + } else { + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8); /* works even if matchLength < 8 */ + } + return sequenceLength; +} + + +HINT_INLINE +size_t ZSTD_execSequenceLong(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = sequence.match; + + /* check */ + if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ + if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd); + + /* copy Literals */ + ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */ + if (sequence.litLength > 8) + ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + if (sequence.offset > (size_t)(oLitEnd - dictStart)) return 
ERROR(corruption_detected); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + if (op > oend_w || sequence.matchLength < MINMATCH) { + U32 i; + for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i]; + return sequenceLength; + } + } } + assert(op <= oend_w); + assert(sequence.matchLength >= MINMATCH); + + /* match within prefix */ + if (sequence.offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ int const sub2 = dec64table[sequence.offset]; op[0] = match[0]; op[1] = match[1]; @@ -1002,11 +1231,121 @@ return sequenceLength; } - -static size_t ZSTD_decompressSequences( - ZSTD_DCtx* dctx, +static void +ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) +{ + const void* ptr = dt; + const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", + (U32)DStatePtr->state, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) +{ + ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.nextState + lowBits; +} + +/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum + * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) + * bits before reloading. 
This value is the maximum number of bytes we read + * after reloading when we are decoding long offets. + */ +#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ + (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ + ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ + : 0) + +typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + +FORCE_INLINE_TEMPLATE seq_t +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) +{ + seq_t seq; + U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits; + U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits; + U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits; + U32 const totalBits = llBits+mlBits+ofBits; + U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue; + U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue; + U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue; + + /* sequence */ + { size_t offset; + if (!ofBits) + offset = 0; + else { + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + } + + if (ofBits <= 1) { + offset += (llBase==0); + if (offset) { + size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } else { /* offset == 0 */ + offset = seqState->prevOffset[0]; + } + } else { + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + seq.offset = offset; + } + + seq.matchLength = mlBase + + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */ + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + seq.litLength = llBase + + ((llBits>0) ? 
BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */ + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + /* ANS state update */ + ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + + return seq; +} + +FORCE_INLINE_TEMPLATE size_t +ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize) + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; @@ -1018,36 +1357,32 @@ const BYTE* const base = (const BYTE*) (dctx->base); const BYTE* const vBase = (const BYTE*) (dctx->vBase); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - int nbSeq; - - /* Build Decoding Tables */ - { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); - if (ZSTD_isError(seqHSize)) return seqHSize; - ip += seqHSize; - } + DEBUGLOG(5, "ZSTD_decompressSequences"); /* Regen sequences */ if (nbSeq) { seqState_t seqState; dctx->fseEntropy = 1; - { U32 i; for (i=0; irep[i]; } + { U32 i; for (i=0; ientropy.rep[i]; } CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); - FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); - FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); - FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, 
&seqState.DStream, dctx->MLTptr); for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) { nbSeq--; - { seq_t const sequence = ZSTD_decodeSequence(&seqState); + { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; } } /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences: after decode loop, remaining nbSeq : %i", nbSeq); if (nbSeq) return ERROR(corruption_detected); /* save reps for next block */ - { U32 i; for (i=0; irep[i] = (U32)(seqState.prevOffset[i]); } + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ @@ -1060,48 +1395,50 @@ return op-ostart; } - -static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) +static size_t +ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + + + +FORCE_INLINE_TEMPLATE seq_t +ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets) { seq_t seq; - - U32 const llCode = FSE_peekSymbol(&seqState->stateLL); - U32 const mlCode = FSE_peekSymbol(&seqState->stateML); - U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ - - U32 const llBits = LL_bits[llCode]; - U32 const mlBits = ML_bits[mlCode]; - U32 const ofBits = ofCode; + U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits; + U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits; + U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits; U32 
const totalBits = llBits+mlBits+ofBits; - - static const U32 LL_base[MaxLL+1] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, - 0x2000, 0x4000, 0x8000, 0x10000 }; - - static const U32 ML_base[MaxML+1] = { - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, - 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, - 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; - - static const U32 OF_base[MaxOff+1] = { - 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, - 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, - 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, - 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; + U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue; + U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue; + U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue; /* sequence */ { size_t offset; - if (!ofCode) + if (!ofBits) offset = 0; else { - offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets) { + U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } } - if 
(ofCode <= 1) { - offset += (llCode==0); + if (ofBits <= 1) { + offset += (llBase==0); if (offset) { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ @@ -1119,114 +1456,40 @@ seq.offset = offset; } - seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ - if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); - - seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ - if (MEM_32bits() || - (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); + seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); { size_t const pos = seqState->pos + seq.litLength; - seq.match = seqState->base + pos - seq.offset; /* single memory segment */ - if (seq.offset > pos) seq.match += seqState->gotoDict; /* separate memory segment */ + const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; + seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 
+ * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */ seqState->pos = pos + seq.matchLength; } /* ANS state update */ - FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ return seq; } -FORCE_INLINE -size_t ZSTD_execSequenceLong(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) -{ - BYTE* const oLitEnd = op + sequence.litLength; - size_t const sequenceLength = sequence.litLength + sequence.matchLength; - BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ - BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; - const BYTE* const iLitEnd = *litPtr + sequence.litLength; - const BYTE* match = sequence.match; - - /* check */ -#if 1 - if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ - if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ - if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); -#endif - - /* copy Literals */ - ZSTD_copy8(op, *litPtr); - if (sequence.litLength > 8) - ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ - op = oLitEnd; - *litPtr = iLitEnd; /* update for next 
sequence */ - - /* copy Match */ -#if 1 - if (sequence.offset > (size_t)(oLitEnd - base)) { - /* offset beyond prefix */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); - if (match + sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); - return sequenceLength; - } - /* span extDict & currentPrefixSegment */ - { size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); - op = oLitEnd + length1; - sequence.matchLength -= length1; - match = base; - if (op > oend_w || sequence.matchLength < MINMATCH) { - U32 i; - for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i]; - return sequenceLength; - } - } } - /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */ -#endif - - /* match within prefix */ - if (sequence.offset < 8) { - /* close range match, overlap */ - static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ - static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* substracted */ - int const sub2 = dec64table[sequence.offset]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[sequence.offset]; - ZSTD_copy4(op+4, match); - match -= sub2; - } else { - ZSTD_copy8(op, match); - } - op += 8; match += 8; - - if (oMatchEnd > oend-(16-MINMATCH)) { - if (op < oend_w) { - ZSTD_wildcopy(op, match, oend_w - op); - match += oend_w - op; - op = oend_w; - } - while (op < oMatchEnd) *op++ = *match++; - } else { - ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8); /* works even if matchLength < 8 */ - } - return sequenceLength; -} - -static size_t ZSTD_decompressSequencesLong( +FORCE_INLINE_TEMPLATE size_t +ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize) + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + 
seqSize; @@ -1235,16 +1498,9 @@ BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; - const BYTE* const base = (const BYTE*) (dctx->base); - const BYTE* const vBase = (const BYTE*) (dctx->vBase); + const BYTE* const prefixStart = (const BYTE*) (dctx->base); + const BYTE* const dictStart = (const BYTE*) (dctx->vBase); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - int nbSeq; - - /* Build Decoding Tables */ - { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); - if (ZSTD_isError(seqHSize)) return seqHSize; - ip += seqHSize; - } /* Regen sequences */ if (nbSeq) { @@ -1256,27 +1512,27 @@ seqState_t seqState; int seqNb; dctx->fseEntropy = 1; - { U32 i; for (i=0; irep[i]; } - seqState.base = base; - seqState.pos = (size_t)(op-base); - seqState.gotoDict = (iPtrDiff)(dictEnd - base); + { U32 i; for (i=0; ientropy.rep[i]; } + seqState.prefixStart = prefixStart; + seqState.pos = (size_t)(op-prefixStart); + seqState.dictEnd = dictEnd; CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); - FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); - FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); - FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } +#undef STORED_SEQS +#undef STOSEQ_MASK +#undef ADVANCED_SEQS } /* last literal segment */ @@ -1304,23 +1563,137 @@ return op-ostart; } +static size_t +ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t 
maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + + + +#if DYNAMIC_BMI2 + +static TARGET_ATTRIBUTE("bmi2") size_t +ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + +static TARGET_ATTRIBUTE("bmi2") size_t +ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + +#endif + +typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, + const void *seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset); + +static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + DEBUGLOG(5, "ZSTD_decompressSequences"); +#if DYNAMIC_BMI2 + if (dctx->bmi2) { + return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } +#endif + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + +static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); +#if DYNAMIC_BMI2 + if (dctx->bmi2) { + return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } +#endif + return 
ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + +/* ZSTD_getLongOffsetsShare() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) + * compared to maximum possible of (1< 22) total += 1; + } + + assert(tableLog <= OffFSELog); + total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + + return total; +} + static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize) + const void* src, size_t srcSize, const int frame) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; - - if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(srcSize_wrong); - - /* Decode literals sub-block */ + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. + * We don't expect that to be the case in 64-bit mode. + * In block mode, window size is not known, so we have to be conservative. 
(note: but it could be evaluated from current-lowLimit) + */ + ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + + if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); + + /* Decode literals section */ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; } - if (dctx->fParams.windowSize > (1<<23)) return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize); - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize); + + /* Build Decoding Tables */ + { int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + srcSize -= seqHSize; + + if ( (!frame || dctx->fParams.windowSize > (1<<24)) + && (nbSeq>0) ) { /* could probably use a larger nbSeq limit */ + U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); + U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + if (shareLongOffsets >= minShare) + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + } + + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + } } @@ -1340,7 +1713,7 @@ { size_t dSize; ZSTD_checkContinuity(dctx, dst); - dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); dctx->previousDstEnd = (char*)dst + dSize; return dSize; } @@ -1356,34 +1729,89 @@ } -size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length) +static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length) { if (length > dstCapacity) return ERROR(dstSize_tooSmall); memset(dst, byte, length); return length; } +/** ZSTD_findFrameCompressedSize() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the compressed size of the frame starting at `src` */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) +{ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) + return ZSTD_findFrameCompressedSizeLegacy(src, srcSize); +#endif + if ( (srcSize >= ZSTD_skippableHeaderSize) + && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) { + return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize); + } else { + const BYTE* ip = (const BYTE*)src; + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ + { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(ret)) return ret; + if (ret > 0) return ERROR(srcSize_wrong); + } + + ip += zfh.headerSize; + 
remainingSize -= zfh.headerSize; + + /* Loop on each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ERROR(srcSize_wrong); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= ZSTD_blockHeaderSize + cBlockSize; + + if (blockProperties.lastBlock) break; + } + + if (zfh.checksumFlag) { /* Final frame content checksum */ + if (remainingSize < 4) return ERROR(srcSize_wrong); + ip += 4; + remainingSize -= 4; + } + + return ip - ipstart; + } +} /*! ZSTD_decompressFrame() : -* `dctx` must be properly initialized */ +* @dctx must be properly initialized */ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize) + void* dst, size_t dstCapacity, + const void** srcPtr, size_t *srcSizePtr) { - const BYTE* ip = (const BYTE*)src; + const BYTE* ip = (const BYTE*)(*srcPtr); BYTE* const ostart = (BYTE* const)dst; BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; - size_t remainingSize = srcSize; + size_t remainingSize = *srcSizePtr; /* check */ - if (srcSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong); + if (remainingSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); /* Frame Header */ - { size_t const frameHeaderSize = ZSTD_frameHeaderSize(src, ZSTD_frameHeaderSize_prefix); + { size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix); if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize; - if (srcSize < frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong); - CHECK_F(ZSTD_decodeFrameHeader(dctx, src, frameHeaderSize)); + if (remainingSize < frameHeaderSize+ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + CHECK_F( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) ); ip 
+= frameHeaderSize; remainingSize -= frameHeaderSize; } @@ -1401,7 +1829,7 @@ switch(blockProperties.blockType) { case bt_compressed: - decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1); break; case bt_raw : decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize); @@ -1415,38 +1843,123 @@ } if (ZSTD_isError(decodedSize)) return decodedSize; - if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, op, decodedSize); + if (dctx->fParams.checksumFlag) + XXH64_update(&dctx->xxhState, op, decodedSize); op += decodedSize; ip += cBlockSize; remainingSize -= cBlockSize; if (blockProperties.lastBlock) break; } - if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { + if ((U64)(op-ostart) != dctx->fParams.frameContentSize) { + return ERROR(corruption_detected); + } } + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState); U32 checkRead; if (remainingSize<4) return ERROR(checksum_wrong); checkRead = MEM_readLE32(ip); if (checkRead != checkCalc) return ERROR(checksum_wrong); + ip += 4; remainingSize -= 4; } - if (remainingSize) return ERROR(srcSize_wrong); + /* Allow caller to get size read */ + *srcPtr = ip; + *srcSizePtr = remainingSize; return op-ostart; } +static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict); +static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict); + +static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + const ZSTD_DDict* ddict) +{ + void* const dststart = dst; + assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */ + + if (ddict) { + dict = ZSTD_DDictDictContent(ddict); + dictSize = ZSTD_DDictDictSize(ddict); 
+ } + + while (srcSize >= ZSTD_frameHeaderSize_prefix) { + U32 magicNumber; + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + size_t decodedSize; + size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); + if (ZSTD_isError(frameSize)) return frameSize; + /* legacy support is not compatible with static dctx */ + if (dctx->staticSize) return ERROR(memory_allocation); + + decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); + + dst = (BYTE*)dst + decodedSize; + dstCapacity -= decodedSize; + + src = (const BYTE*)src + frameSize; + srcSize -= frameSize; + + continue; + } +#endif + + magicNumber = MEM_readLE32(src); + DEBUGLOG(4, "reading magic number %08X (expecting %08X)", + (U32)magicNumber, (U32)ZSTD_MAGICNUMBER); + if (magicNumber != ZSTD_MAGICNUMBER) { + if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t skippableSize; + if (srcSize < ZSTD_skippableHeaderSize) + return ERROR(srcSize_wrong); + skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize) + + ZSTD_skippableHeaderSize; + if (srcSize < skippableSize) return ERROR(srcSize_wrong); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + return ERROR(prefix_unknown); + } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + CHECK_F(ZSTD_decompressBegin_usingDDict(dctx, ddict)); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize)); + } + ZSTD_checkContinuity(dctx, dst); + + { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, + &src, &srcSize); + if (ZSTD_isError(res)) return res; + /* no need to bound check, ZSTD_decompressFrame already has */ + dst = (BYTE*)dst + res; + dstCapacity -= res; + } + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + if (srcSize) return 
ERROR(srcSize_wrong); /* input not entirely consumed */ + + return (BYTE*)dst - (BYTE*)dststart; +} size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize) { -#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1) - if (ZSTD_isLegacy(src, srcSize)) return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, dict, dictSize); -#endif - CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize)); - ZSTD_checkContinuity(dctx, dst); - return ZSTD_decompressFrame(dctx, dst, dstCapacity, src, srcSize); + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); } @@ -1458,7 +1971,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize) { -#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE==1) +#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) size_t regenSize; ZSTD_DCtx* const dctx = ZSTD_createDCtx(); if (dctx==NULL) return ERROR(memory_allocation); @@ -1482,6 +1995,7 @@ switch(dctx->stage) { default: /* should not happen */ + assert(0); case ZSTDds_getFrameHeaderSize: case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; @@ -1499,39 +2013,41 @@ } } -int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; } /* for zbuff */ +static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; } /** ZSTD_decompressContinue() : -* @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity) -* or an error code, which can be tested using ZSTD_isError() */ + * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress()) + * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity) + * or an error code, which can be tested using ZSTD_isError() */ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { + DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", 
(U32)srcSize); /* Sanity check */ - if (srcSize != dctx->expected) return ERROR(srcSize_wrong); + if (srcSize != dctx->expected) return ERROR(srcSize_wrong); /* not allowed */ if (dstCapacity) ZSTD_checkContinuity(dctx, dst); switch (dctx->stage) { case ZSTDds_getFrameHeaderSize : - if (srcSize != ZSTD_frameHeaderSize_prefix) return ERROR(srcSize_wrong); /* impossible */ - if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ - memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix); - dctx->expected = ZSTD_skippableHeaderSize - ZSTD_frameHeaderSize_prefix; /* magic number + skippable frame length */ - dctx->stage = ZSTDds_decodeSkippableHeader; - return 0; - } - dctx->headerSize = ZSTD_frameHeaderSize(src, ZSTD_frameHeaderSize_prefix); + assert(src != NULL); + if (dctx->format == ZSTD_f_zstd1) { /* allows header */ + assert(srcSize >= ZSTD_frameIdSize); /* to read skippable magic number */ + if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = ZSTD_skippableHeaderSize - srcSize; /* remaining to load to get full skippable frame header */ + dctx->stage = ZSTDds_decodeSkippableHeader; + return 0; + } } + dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format); if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize; - memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix); - if (dctx->headerSize > ZSTD_frameHeaderSize_prefix) { - dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_prefix; - dctx->stage = ZSTDds_decodeFrameHeader; - return 0; - } - dctx->expected = 0; /* not necessary to copy more */ + memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = dctx->headerSize - srcSize; + dctx->stage = ZSTDds_decodeFrameHeader; + return 0; case ZSTDds_decodeFrameHeader: - memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected); + assert(src != NULL); + 
memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize); CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize)); dctx->expected = ZSTD_blockHeaderSize; dctx->stage = ZSTDds_decodeBlockHeader; @@ -1558,18 +2074,21 @@ dctx->stage = ZSTDds_getFrameHeaderSize; } } else { - dctx->expected = 3; /* go directly to next header */ + dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */ dctx->stage = ZSTDds_decodeBlockHeader; } return 0; } + case ZSTDds_decompressLastBlock: case ZSTDds_decompressBlock: + DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock"); { size_t rSize; switch(dctx->bType) { case bt_compressed: - rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize); + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); break; case bt_raw : rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); @@ -1582,9 +2101,16 @@ return ERROR(corruption_detected); } if (ZSTD_isError(rSize)) return rSize; + DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (U32)rSize); + dctx->decodedSize += rSize; if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize); if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (U32)dctx->decodedSize); + if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { + if (dctx->decodedSize != dctx->fParams.frameContentSize) { + return ERROR(corruption_detected); + } } if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ dctx->expected = 4; dctx->stage = ZSTDds_checkChecksum; @@ -1599,25 +2125,31 @@ } return rSize; } + case ZSTDds_checkChecksum: + assert(srcSize == 4); /* guaranteed by dctx->expected */ { U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); - U32 const check32 = MEM_readLE32(src); /* srcSize == 4, guaranteed by 
dctx->expected */ + U32 const check32 = MEM_readLE32(src); + DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", h32, check32); if (check32 != h32) return ERROR(checksum_wrong); dctx->expected = 0; dctx->stage = ZSTDds_getFrameHeaderSize; return 0; } + case ZSTDds_decodeSkippableHeader: - { memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected); - dctx->expected = MEM_readLE32(dctx->headerBuffer + 4); - dctx->stage = ZSTDds_skipFrame; - return 0; - } + assert(src != NULL); + assert(srcSize <= ZSTD_skippableHeaderSize); + memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; + return 0; + case ZSTDds_skipFrame: - { dctx->expected = 0; - dctx->stage = ZSTDds_getFrameHeaderSize; - return 0; - } + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + default: return ERROR(GENERIC); /* impossible */ } @@ -1633,22 +2165,35 @@ return 0; } -static size_t ZSTD_loadEntropy(ZSTD_DCtx* dctx, const void* const dict, size_t const dictSize) +/* ZSTD_loadEntropy() : + * dict : must point at beginning of a valid zstd dictionary + * @return : size of entropy tables read */ +static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize) { const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; - { size_t const hSize = HUF_readDTableX4(dctx->hufTable, dict, dictSize); + if (dictSize <= 8) return ERROR(dictionary_corrupted); + dictPtr += 8; /* skip header = magic + dictID */ + + + { size_t const hSize = HUF_readDTableX4_wksp( + entropy->hufTable, dictPtr, dictEnd - dictPtr, + entropy->workspace, sizeof(entropy->workspace)); if (HUF_isError(hSize)) return ERROR(dictionary_corrupted); dictPtr += 
hSize; } { short offcodeNCount[MaxOff+1]; - U32 offcodeMaxValue=MaxOff, offcodeLog; + U32 offcodeMaxValue = MaxOff, offcodeLog; size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted); + if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted); if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted); - CHECK_E(FSE_buildDTable(dctx->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog), dictionary_corrupted); + ZSTD_buildFSETable(entropy->OFTable, + offcodeNCount, offcodeMaxValue, + OF_base, OF_bits, + offcodeLog); dictPtr += offcodeHeaderSize; } @@ -1656,8 +2201,12 @@ unsigned matchlengthMaxValue = MaxML, matchlengthLog; size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted); + if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted); if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted); - CHECK_E(FSE_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog), dictionary_corrupted); + ZSTD_buildFSETable(entropy->MLTable, + matchlengthNCount, matchlengthMaxValue, + ML_base, ML_bits, + matchlengthLog); dictPtr += matchlengthHeaderSize; } @@ -1665,18 +2214,24 @@ unsigned litlengthMaxValue = MaxLL, litlengthLog; size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted); + if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted); if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted); - CHECK_E(FSE_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog), dictionary_corrupted); + ZSTD_buildFSETable(entropy->LLTable, + litlengthNCount, 
litlengthMaxValue, + LL_base, LL_bits, + litlengthLog); dictPtr += litlengthHeaderSize; } if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted); - dctx->rep[0] = MEM_readLE32(dictPtr+0); if (dctx->rep[0] == 0 || dctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted); - dctx->rep[1] = MEM_readLE32(dictPtr+4); if (dctx->rep[1] == 0 || dctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted); - dctx->rep[2] = MEM_readLE32(dictPtr+8); if (dctx->rep[2] == 0 || dctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted); - dictPtr += 12; - - dctx->litEntropy = dctx->fseEntropy = 1; + { int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12)); + for (i=0; i<3; i++) { + U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4; + if (rep==0 || rep >= dictContentSize) return ERROR(dictionary_corrupted); + entropy->rep[i] = rep; + } } + return dictPtr - (const BYTE*)dict; } @@ -1684,28 +2239,51 @@ { if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize); { U32 const magic = MEM_readLE32(dict); - if (magic != ZSTD_DICT_MAGIC) { + if (magic != ZSTD_MAGIC_DICTIONARY) { return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ } } - dctx->dictID = MEM_readLE32((const char*)dict + 4); + dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize); /* load entropy tables */ - dict = (const char*)dict + 8; - dictSize -= 8; - { size_t const eSize = ZSTD_loadEntropy(dctx, dict, dictSize); + { size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize); if (ZSTD_isError(eSize)) return ERROR(dictionary_corrupted); dict = (const char*)dict + eSize; dictSize -= eSize; } + dctx->litEntropy = dctx->fseEntropy = 1; /* reference dictionary content */ return ZSTD_refDictContent(dctx, dict, dictSize); } +/* Note : this function cannot fail */ +size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +{ + assert(dctx != NULL); + dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ + 
dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->decodedSize = 0; + dctx->previousDstEnd = NULL; + dctx->base = NULL; + dctx->vBase = NULL; + dctx->dictEnd = NULL; + dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; + dctx->MLTptr = dctx->entropy.MLTable; + dctx->OFTptr = dctx->entropy.OFTable; + dctx->HUFptr = dctx->entropy.hufTable; + return 0; +} + size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) { - CHECK_F(ZSTD_decompressBegin(dctx)); - if (dict && dictSize) CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted); + CHECK_F( ZSTD_decompressBegin(dctx) ); + if (dict && dictSize) + CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted); return 0; } @@ -1716,83 +2294,186 @@ void* dictBuffer; const void* dictContent; size_t dictSize; - ZSTD_DCtx* refContext; + ZSTD_entropyDTables_t entropy; + U32 dictID; + U32 entropyPresent; + ZSTD_customMem cMem; }; /* typedef'd to ZSTD_DDict within "zstd.h" */ -ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, unsigned byReference, ZSTD_customMem customMem) +static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict) +{ + return ddict->dictContent; +} + +static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict) +{ + return ddict->dictSize; +} + +size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict) +{ + CHECK_F( ZSTD_decompressBegin(dstDCtx) ); + if (ddict) { /* support begin on NULL */ + dstDCtx->dictID = ddict->dictID; + dstDCtx->base = ddict->dictContent; + dstDCtx->vBase = ddict->dictContent; + dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + 
ddict->dictSize; + dstDCtx->previousDstEnd = dstDCtx->dictEnd; + if (ddict->entropyPresent) { + dstDCtx->litEntropy = 1; + dstDCtx->fseEntropy = 1; + dstDCtx->LLTptr = ddict->entropy.LLTable; + dstDCtx->MLTptr = ddict->entropy.MLTable; + dstDCtx->OFTptr = ddict->entropy.OFTable; + dstDCtx->HUFptr = ddict->entropy.hufTable; + dstDCtx->entropy.rep[0] = ddict->entropy.rep[0]; + dstDCtx->entropy.rep[1] = ddict->entropy.rep[1]; + dstDCtx->entropy.rep[2] = ddict->entropy.rep[2]; + } else { + dstDCtx->litEntropy = 0; + dstDCtx->fseEntropy = 0; + } + } + return 0; +} + +static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType) { - if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; - if (!customMem.customAlloc || !customMem.customFree) return NULL; + ddict->dictID = 0; + ddict->entropyPresent = 0; + if (dictContentType == ZSTD_dct_rawContent) return 0; + + if (ddict->dictSize < 8) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + { U32 const magic = MEM_readLE32(ddict->dictContent); + if (magic != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + } + ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize); + + /* load entropy tables */ + CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted ); + ddict->entropyPresent = 1; + return 0; +} + + +static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + } else { + void* const 
internalBuffer = ZSTD_malloc(dictSize, ddict->cMem); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; + if (!internalBuffer) return ERROR(memory_allocation); + memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; + ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + CHECK_F( ZSTD_loadEntropy_inDDict(ddict, dictContentType) ); + + return 0; +} + +ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem) +{ + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem); - ZSTD_DCtx* const dctx = ZSTD_createDCtx_advanced(customMem); - - if (!ddict || !dctx) { - ZSTD_free(ddict, customMem); - ZSTD_free(dctx, customMem); + if (!ddict) return NULL; + ddict->cMem = customMem; + + if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) { + ZSTD_freeDDict(ddict); return NULL; } - if ((byReference) || (!dict) || (!dictSize)) { - ddict->dictBuffer = NULL; - ddict->dictContent = dict; - } else { - void* const internalBuffer = ZSTD_malloc(dictSize, customMem); - if (!internalBuffer) { ZSTD_free(dctx, customMem); ZSTD_free(ddict, customMem); return NULL; } - memcpy(internalBuffer, dict, dictSize); - ddict->dictBuffer = internalBuffer; - ddict->dictContent = internalBuffer; - } - { size_t const errorCode = ZSTD_decompressBegin_usingDict(dctx, ddict->dictContent, dictSize); - if (ZSTD_isError(errorCode)) { - ZSTD_free(ddict->dictBuffer, customMem); - ZSTD_free(ddict, customMem); - ZSTD_free(dctx, customMem); - return NULL; - } } - - ddict->dictSize = dictSize; - ddict->refContext = dctx; return ddict; } } /*! 
ZSTD_createDDict() : -* Create a digested dictionary, ready to start decompression without startup delay. -* `dict` can be released after `ZSTD_DDict` creation */ +* Create a digested dictionary, to start decompression without startup delay. +* `dict` content is copied inside DDict. +* Consequently, `dict` can be released after `ZSTD_DDict` creation */ ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; - return ZSTD_createDDict_advanced(dict, dictSize, 0, allocator); + return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator); } - /*! ZSTD_createDDict_byReference() : - * Create a digested dictionary, ready to start decompression operation without startup delay. - * Dictionary content is simply referenced, and therefore stays in dictBuffer. - * It is important that dictBuffer outlives DDict, it must remain read accessible throughout the lifetime of DDict */ + * Create a digested dictionary, to start decompression without startup delay. + * Dictionary content is simply referenced, it will be accessed during decompression. + * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */ ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; - return ZSTD_createDDict_advanced(dictBuffer, dictSize, 1, allocator); + return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator); +} + + +const ZSTD_DDict* ZSTD_initStaticDDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + size_t const neededSpace = + sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 : dictSize); + ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace; + assert(workspace != NULL); + assert(dict != NULL); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + if (workspaceSize < neededSpace) return NULL; + if (dictLoadMethod == ZSTD_dlm_byCopy) { + memcpy(ddict+1, dict, dictSize); /* local copy */ + dict = ddict+1; + } + if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) )) + return NULL; + return ddict; } size_t ZSTD_freeDDict(ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support free on NULL */ - { ZSTD_customMem const cMem = ddict->refContext->customMem; - ZSTD_freeDCtx(ddict->refContext); + { ZSTD_customMem const cMem = ddict->cMem; ZSTD_free(ddict->dictBuffer, cMem); ZSTD_free(ddict, cMem); return 0; } } +/*! ZSTD_estimateDDictSize() : + * Estimate amount of memory that will be needed to create a dictionary for decompression. + * Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */ +size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) +{ + return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); +} + size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support sizeof on NULL */ - return sizeof(*ddict) + ZSTD_sizeof_DCtx(ddict->refContext) + (ddict->dictBuffer ? ddict->dictSize : 0) ; + return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ; } /*! ZSTD_getDictID_fromDict() : @@ -1802,8 +2483,8 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) { if (dictSize < 8) return 0; - if (MEM_readLE32(dict) != ZSTD_DICT_MAGIC) return 0; - return MEM_readLE32((const char*)dict + 4); + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0; + return MEM_readLE32((const char*)dict + ZSTD_frameIdSize); } /*! ZSTD_getDictID_fromDDict() : @@ -1817,19 +2498,22 @@ } /*! 
ZSTD_getDictID_fromFrame() : - * Provides the dictID required to decompressed the frame stored within `src`. + * Provides the dictID required to decompresse frame stored within `src`. * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : - * - The frame does not require a dictionary to be decoded (most common case). - * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. + * Needed dictionary is a hidden information. * Note : this use case also happens when using a non-conformant dictionary. - * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. * - This is not a Zstandard frame. - * When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. */ + * When identifying the exact failure cause, it's possible to use + * ZSTD_getFrameHeader(), which will provide a more precise error code. 
*/ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) { - ZSTD_frameParams zfp = { 0 , 0 , 0 , 0 }; - size_t const hError = ZSTD_getFrameParams(&zfp, src, srcSize); + ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); if (ZSTD_isError(hError)) return 0; return zfp.dictID; } @@ -1843,12 +2527,10 @@ const void* src, size_t srcSize, const ZSTD_DDict* ddict) { -#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1) - if (ZSTD_isLegacy(src, srcSize)) return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, ddict->dictContent, ddict->dictSize); -#endif - ZSTD_refDCtx(dctx, ddict->refContext); - ZSTD_checkContinuity(dctx, dst); - return ZSTD_decompressFrame(dctx, dst, dstCapacity, src, srcSize); + /* pass content and size in case legacy frames are encountered */ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, + NULL, 0, + ddict); } @@ -1856,133 +2538,180 @@ * Streaming decompression *====================================*/ -typedef enum { zdss_init, zdss_loadHeader, - zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; - -/* *** Resource management *** */ -struct ZSTD_DStream_s { - ZSTD_DCtx* dctx; - ZSTD_DDict* ddictLocal; - const ZSTD_DDict* ddict; - ZSTD_frameParams fParams; - ZSTD_dStreamStage stage; - char* inBuff; - size_t inBuffSize; - size_t inPos; - size_t maxWindowSize; - char* outBuff; - size_t outBuffSize; - size_t outStart; - size_t outEnd; - size_t blockSize; - BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; /* tmp buffer to store frame header */ - size_t lhSize; - ZSTD_customMem customMem; - void* legacyContext; - U32 previousLegacyVersion; - U32 legacyVersion; - U32 hostageByte; -}; /* typedef'd to ZSTD_DStream within "zstd.h" */ - - ZSTD_DStream* ZSTD_createDStream(void) { - return ZSTD_createDStream_advanced(defaultCustomMem); + DEBUGLOG(3, "ZSTD_createDStream"); + return ZSTD_createDStream_advanced(ZSTD_defaultCMem); +} + 
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticDCtx(workspace, workspaceSize); } ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) { - ZSTD_DStream* zds; - - if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; - if (!customMem.customAlloc || !customMem.customFree) return NULL; - - zds = (ZSTD_DStream*) ZSTD_malloc(sizeof(ZSTD_DStream), customMem); - if (zds==NULL) return NULL; - memset(zds, 0, sizeof(ZSTD_DStream)); - memcpy(&zds->customMem, &customMem, sizeof(ZSTD_customMem)); - zds->dctx = ZSTD_createDCtx_advanced(customMem); - if (zds->dctx == NULL) { ZSTD_freeDStream(zds); return NULL; } - zds->stage = zdss_init; - zds->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; - return zds; + return ZSTD_createDCtx_advanced(customMem); } size_t ZSTD_freeDStream(ZSTD_DStream* zds) { - if (zds==NULL) return 0; /* support free on null */ - { ZSTD_customMem const cMem = zds->customMem; - ZSTD_freeDCtx(zds->dctx); - ZSTD_freeDDict(zds->ddictLocal); - ZSTD_free(zds->inBuff, cMem); - ZSTD_free(zds->outBuff, cMem); -#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) - if (zds->legacyContext) - ZSTD_freeLegacyStreamContext(zds->legacyContext, zds->previousLegacyVersion); -#endif - ZSTD_free(zds, cMem); - return 0; - } + return ZSTD_freeDCtx(zds); } /* *** Initialization *** */ -size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX + ZSTD_blockHeaderSize; } -size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; } - +size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; } +size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) +{ + if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); + ZSTD_freeDDict(dctx->ddictLocal); + 
if (dict && dictSize >= 8) { + dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem); + if (dctx->ddictLocal == NULL) return ERROR(memory_allocation); + } else { + dctx->ddictLocal = NULL; + } + dctx->ddict = dctx->ddictLocal; + return 0; +} + +size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType); +} + +size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + + +/* ZSTD_initDStream_usingDict() : + * return : expected size, aka ZSTD_frameHeaderSize_prefix. 
+ * this function cannot fail */ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) { - zds->stage = zdss_loadHeader; - zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; - ZSTD_freeDDict(zds->ddictLocal); - if (dict && dictSize >= 8) { - zds->ddictLocal = ZSTD_createDDict(dict, dictSize); - if (zds->ddictLocal == NULL) return ERROR(memory_allocation); - } else zds->ddictLocal = NULL; - zds->ddict = zds->ddictLocal; - zds->legacyVersion = 0; - zds->hostageByte = 0; + DEBUGLOG(4, "ZSTD_initDStream_usingDict"); + zds->streamStage = zdss_init; + CHECK_F( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) ); return ZSTD_frameHeaderSize_prefix; } +/* note : this variant can't fail */ size_t ZSTD_initDStream(ZSTD_DStream* zds) { + DEBUGLOG(4, "ZSTD_initDStream"); return ZSTD_initDStream_usingDict(zds, NULL, 0); } -size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict) /**< note : ddict will just be referenced, and must outlive decompression session */ +size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { - size_t const initResult = ZSTD_initDStream(zds); - zds->ddict = ddict; + if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); + dctx->ddict = ddict; + return 0; +} + +/* ZSTD_initDStream_usingDDict() : + * ddict will just be referenced, and must outlive decompression session + * this function cannot fail */ +size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) +{ + size_t const initResult = ZSTD_initDStream(dctx); + dctx->ddict = ddict; return initResult; } -size_t ZSTD_resetDStream(ZSTD_DStream* zds) +/* ZSTD_resetDStream() : + * return : expected size, aka ZSTD_frameHeaderSize_prefix. 
+ * this function cannot fail */ +size_t ZSTD_resetDStream(ZSTD_DStream* dctx) { - zds->stage = zdss_loadHeader; - zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; - zds->legacyVersion = 0; - zds->hostageByte = 0; + DEBUGLOG(4, "ZSTD_resetDStream"); + dctx->streamStage = zdss_loadHeader; + dctx->lhSize = dctx->inPos = dctx->outStart = dctx->outEnd = 0; + dctx->legacyVersion = 0; + dctx->hostageByte = 0; return ZSTD_frameHeaderSize_prefix; } -size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, +size_t ZSTD_setDStreamParameter(ZSTD_DStream* dctx, ZSTD_DStreamParameter_e paramType, unsigned paramValue) { + if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); switch(paramType) { - default : return ERROR(parameter_unknown); - case DStream_p_maxWindowSize : zds->maxWindowSize = paramValue ? paramValue : (U32)(-1); break; + default : return ERROR(parameter_unsupported); + case DStream_p_maxWindowSize : + DEBUGLOG(4, "setting maxWindowSize = %u KB", paramValue >> 10); + dctx->maxWindowSize = paramValue ? 
paramValue : (U32)(-1); + break; } return 0; } - -size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds) +size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) +{ + if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); + dctx->maxWindowSize = maxWindowSize; + return 0; +} + +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) +{ + DEBUGLOG(4, "ZSTD_DCtx_setFormat : %u", (unsigned)format); + if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); + dctx->format = format; + return 0; +} + + +size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) +{ + return ZSTD_sizeof_DCtx(dctx); +} + +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) { - if (zds==NULL) return 0; /* support sizeof on NULL */ - return sizeof(*zds) + ZSTD_sizeof_DCtx(zds->dctx) + ZSTD_sizeof_DDict(zds->ddictLocal) + zds->inBuffSize + zds->outBuffSize; + size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + if ((unsigned long long)minRBSize != neededSize) return ERROR(frameParameter_windowTooLarge); + return minRBSize; +} + +size_t ZSTD_estimateDStreamSize(size_t windowSize) +{ + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + size_t const inBuffSize = blockSize; /* no block can be larger */ + size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize; +} + +size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) +{ + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable */ + ZSTD_frameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + if (err>0) 
return ERROR(srcSize_wrong); + if (zfh.windowSize > windowSizeMax) + return ERROR(frameParameter_windowTooLarge); + return ZSTD_estimateDStreamSize((size_t)zfh.windowSize); } @@ -2006,126 +2735,193 @@ char* op = ostart; U32 someMoreWork = 1; -#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) - if (zds->legacyVersion) - return ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); -#endif + DEBUGLOG(5, "ZSTD_decompressStream"); + if (input->pos > input->size) { /* forbidden */ + DEBUGLOG(5, "in: pos: %u vs size: %u", + (U32)input->pos, (U32)input->size); + return ERROR(srcSize_wrong); + } + if (output->pos > output->size) { /* forbidden */ + DEBUGLOG(5, "out: pos: %u vs size: %u", + (U32)output->pos, (U32)output->size); + return ERROR(dstSize_tooSmall); + } + DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos)); while (someMoreWork) { - switch(zds->stage) + switch(zds->streamStage) { case zdss_init : + DEBUGLOG(5, "stage zdss_init => transparent reset "); ZSTD_resetDStream(zds); /* transparent reset on starting decoding a new frame */ /* fall-through */ case zdss_loadHeader : - { size_t const hSize = ZSTD_getFrameParams(&zds->fParams, zds->headerBuffer, zds->lhSize); - if (ZSTD_isError(hSize)) + DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) - { U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); + if (zds->legacyVersion) { + /* legacy support is incompatible with static dctx */ + if (zds->staticSize) return ERROR(memory_allocation); + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; + return hint; + } } +#endif + { size_t const hSize = ZSTD_getFrameHeader_internal(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { +#if 
defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); if (legacyVersion) { const void* const dict = zds->ddict ? zds->ddict->dictContent : NULL; size_t const dictSize = zds->ddict ? zds->ddict->dictSize : 0; - CHECK_F(ZSTD_initLegacyStream(&zds->legacyContext, zds->previousLegacyVersion, legacyVersion, - dict, dictSize)); + DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion); + /* legacy support is incompatible with static dctx */ + if (zds->staticSize) return ERROR(memory_allocation); + CHECK_F(ZSTD_initLegacyStream(&zds->legacyContext, + zds->previousLegacyVersion, legacyVersion, + dict, dictSize)); zds->legacyVersion = zds->previousLegacyVersion = legacyVersion; - return ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); - } else { - return hSize; /* error */ - } } -#else - return hSize; + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */ + return hint; + } } #endif + return hSize; /* error */ + } if (hSize != 0) { /* need more input */ size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ - if (toLoad > (size_t)(iend-ip)) { /* not enough input to load full header */ - memcpy(zds->headerBuffer + zds->lhSize, ip, iend-ip); - zds->lhSize += iend-ip; + size_t const remainingInput = (size_t)(iend-ip); + assert(iend >= ip); + if (toLoad > remainingInput) { /* not enough input to load full header */ + if (remainingInput > 0) { + memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); + zds->lhSize += remainingInput; + } input->pos = input->size; return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ } + assert(ip != NULL); memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip 
+= toLoad; break; } } - /* Consume header */ - { const ZSTD_DCtx* refContext = zds->ddict ? zds->ddict->refContext : NULL; - ZSTD_refDCtx(zds->dctx, refContext); + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */ + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, zds->ddict); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + ip = istart + cSize; + op += decompressedSize; + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } } + + /* Consume header (see ZSTDds_decodeFrameHeader) */ + DEBUGLOG(4, "Consume header"); + CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict)); + + if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize); + zds->stage = ZSTDds_skipFrame; + } else { + CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize)); + zds->expected = ZSTD_blockHeaderSize; + zds->stage = ZSTDds_decodeBlockHeader; } - { size_t const h1Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); /* == ZSTD_frameHeaderSize_prefix */ - CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer, h1Size)); - { size_t const h2Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); - CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer+h1Size, h2Size)); - } } - + + /* control buffer memory usage */ + DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)", + (U32)(zds->fParams.windowSize >>10), + (U32)(zds->maxWindowSize >> 10) ); zds->fParams.windowSize = 
MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); if (zds->fParams.windowSize > zds->maxWindowSize) return ERROR(frameParameter_windowTooLarge); /* Adapt buffer sizes to frame header instructions */ - { size_t const blockSize = MIN(zds->fParams.windowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); - size_t const neededOutSize = zds->fParams.windowSize + blockSize; - zds->blockSize = blockSize; - if (zds->inBuffSize < blockSize) { - ZSTD_free(zds->inBuff, zds->customMem); - zds->inBuffSize = blockSize; - zds->inBuff = (char*)ZSTD_malloc(blockSize, zds->customMem); - if (zds->inBuff == NULL) return ERROR(memory_allocation); - } - if (zds->outBuffSize < neededOutSize) { - ZSTD_free(zds->outBuff, zds->customMem); - zds->outBuffSize = neededOutSize; - zds->outBuff = (char*)ZSTD_malloc(neededOutSize, zds->customMem); - if (zds->outBuff == NULL) return ERROR(memory_allocation); + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize); + if ((zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize)) { + size_t const bufferSize = neededInBuffSize + neededOutBuffSize; + DEBUGLOG(4, "inBuff : from %u to %u", + (U32)zds->inBuffSize, (U32)neededInBuffSize); + DEBUGLOG(4, "outBuff : from %u to %u", + (U32)zds->outBuffSize, (U32)neededOutBuffSize); + if (zds->staticSize) { /* static DCtx */ + DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); + assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ + if (bufferSize > zds->staticSize - sizeof(ZSTD_DCtx)) + return ERROR(memory_allocation); + } else { + ZSTD_free(zds->inBuff, zds->customMem); + zds->inBuffSize = 0; + zds->outBuffSize = 0; + zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem); + if (zds->inBuff == NULL) return ERROR(memory_allocation); + } + zds->inBuffSize = neededInBuffSize; + zds->outBuff = zds->inBuff + 
zds->inBuffSize; + zds->outBuffSize = neededOutBuffSize; } } - zds->stage = zdss_read; - /* pass-through */ + zds->streamStage = zdss_read; + /* fall-through */ case zdss_read: - { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx); + DEBUGLOG(5, "stage zdss_read"); + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); + DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize); if (neededInSize==0) { /* end of frame */ - zds->stage = zdss_init; + zds->streamStage = zdss_init; someMoreWork = 0; break; } if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ - const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx); - size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t const decodedSize = ZSTD_decompressContinue(zds, zds->outBuff + zds->outStart, (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart), ip, neededInSize); if (ZSTD_isError(decodedSize)) return decodedSize; ip += neededInSize; if (!decodedSize && !isSkipFrame) break; /* this was just a header */ zds->outEnd = zds->outStart + decodedSize; - zds->stage = zdss_flush; + zds->streamStage = zdss_flush; break; - } - if (ip==iend) { someMoreWork = 0; break; } /* no more input */ - zds->stage = zdss_load; - /* pass-through */ - } + } } + if (ip==iend) { someMoreWork = 0; break; } /* no more input */ + zds->streamStage = zdss_load; + /* fall-through */ case zdss_load: - { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx); - size_t const toLoad = neededInSize - zds->inPos; /* should always be <= remaining space within inBuff */ + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); + size_t const toLoad = neededInSize - zds->inPos; + int const isSkipFrame = ZSTD_isSkipFrame(zds); size_t loadedSize; - if (toLoad > zds->inBuffSize - zds->inPos) return ERROR(corruption_detected); /* should never happen */ - loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip); + if 
(isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { + if (toLoad > zds->inBuffSize - zds->inPos) return ERROR(corruption_detected); /* should never happen */ + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip); + } ip += loadedSize; zds->inPos += loadedSize; if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ /* decode loaded input */ - { const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx); - size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, + { size_t const decodedSize = ZSTD_decompressContinue(zds, zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart, zds->inBuff, neededInSize); if (ZSTD_isError(decodedSize)) return decodedSize; zds->inPos = 0; /* input is consumed */ - if (!decodedSize && !isSkipFrame) { zds->stage = zdss_read; break; } /* this was just a header */ + if (!decodedSize && !isSkipFrame) { zds->streamStage = zdss_read; break; } /* this was just a header */ zds->outEnd = zds->outStart + decodedSize; - zds->stage = zdss_flush; - /* pass-through */ } } + zds->streamStage = zdss_flush; + /* fall-through */ case zdss_flush: { size_t const toFlushSize = zds->outEnd - zds->outStart; @@ -2133,39 +2929,75 @@ op += flushedSize; zds->outStart += flushedSize; if (flushedSize == toFlushSize) { /* flush completed */ - zds->stage = zdss_read; - if (zds->outStart + zds->blockSize > zds->outBuffSize) + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); zds->outStart = zds->outEnd = 0; + } break; - } - /* cannot complete flush */ - someMoreWork = 0; - break; - } + } } + /* cannot complete flush */ + someMoreWork = 0; + break; + default: return ERROR(GENERIC); /* impossible */ } } /* result */ 
input->pos += (size_t)(ip-istart); output->pos += (size_t)(op-ostart); - { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds->dctx); + { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds); if (!nextSrcSizeHint) { /* frame fully decoded */ if (zds->outEnd == zds->outStart) { /* output fully flushed */ if (zds->hostageByte) { - if (input->pos >= input->size) { zds->stage = zdss_read; return 1; } /* can't release hostage (not present) */ + if (input->pos >= input->size) { + /* can't release hostage (not present) */ + zds->streamStage = zdss_read; + return 1; + } input->pos++; /* release hostage */ - } + } /* zds->hostageByte */ return 0; - } + } /* zds->outEnd == zds->outStart */ if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */ input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */ zds->hostageByte=1; } return 1; - } - nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds->dctx) == ZSTDnit_block); /* preload header of next block */ - if (zds->inPos > nextSrcSizeHint) return ERROR(GENERIC); /* should never happen */ - nextSrcSizeHint -= zds->inPos; /* already loaded*/ + } /* nextSrcSizeHint==0 */ + nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */ + assert(zds->inPos <= nextSrcSizeHint); + nextSrcSizeHint -= zds->inPos; /* part already loaded*/ return nextSrcSizeHint; } } + + +size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + return ZSTD_decompressStream(dctx, output, input); +} + +size_t ZSTD_decompress_generic_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) +{ + ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; + ZSTD_inBuffer input = { src, srcSize, *srcPos }; + /* ZSTD_decompress_generic() will check validity of dstPos and
srcPos */ + size_t const cErr = ZSTD_decompress_generic(dctx, &output, &input); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; +} + +void ZSTD_DCtx_reset(ZSTD_DCtx* dctx) +{ + (void)ZSTD_initDStream(dctx); + dctx->format = ZSTD_f_zstd1; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; +} diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/deprecated/zbuff.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/deprecated/zbuff.h Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* *************************************************************** +* NOTES/WARNINGS +******************************************************************/ +/* The streaming API defined here is deprecated. + * Consider migrating towards ZSTD_compressStream() API in `zstd.h` + * See 'lib/README.md'. + *****************************************************************/ + + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_BUFFERED_H_23987 +#define ZSTD_BUFFERED_H_23987 + +/* ************************************* +* Dependencies +***************************************/ +#include <stddef.h> /* size_t */ +#include "zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */ + + +/* *************************************************************** +* Compiler specifics +*****************************************************************/ +/* Deprecation warnings */ +/* Should these warnings be a problem, + it is generally possible to disable them, + typically with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual.
+ Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS */ +#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS +# define ZBUFF_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API +# elif (defined(__GNUC__) && (__GNUC__ >= 5)) || defined(__clang__) +# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ >= 3) +# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement ZBUFF_DEPRECATED for this compiler") +# define ZBUFF_DEPRECATED(message) ZSTDLIB_API +# endif +#endif /* ZBUFF_DISABLE_DEPRECATE_WARNINGS */ + + +/* ************************************* +* Streaming functions +***************************************/ +/* This is the easier "buffered" streaming API, +* using an internal buffer to lift all restrictions on user-provided buffers +* which can be any size, any place, for both input and output. 
+* ZBUFF and ZSTD are 100% interoperable, +* frames created by one can be decoded by the other one */ + +typedef ZSTD_CStream ZBUFF_CCtx; +ZBUFF_DEPRECATED("use ZSTD_createCStream") ZBUFF_CCtx* ZBUFF_createCCtx(void); +ZBUFF_DEPRECATED("use ZSTD_freeCStream") size_t ZBUFF_freeCCtx(ZBUFF_CCtx* cctx); + +ZBUFF_DEPRECATED("use ZSTD_initCStream") size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel); +ZBUFF_DEPRECATED("use ZSTD_initCStream_usingDict") size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); + +ZBUFF_DEPRECATED("use ZSTD_compressStream") size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr); +ZBUFF_DEPRECATED("use ZSTD_flushStream") size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr); +ZBUFF_DEPRECATED("use ZSTD_endStream") size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr); + +/*-************************************************* +* Streaming compression - howto +* +* A ZBUFF_CCtx object is required to track streaming operation. +* Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources. +* ZBUFF_CCtx objects can be reused multiple times. +* +* Start by initializing ZBUFF_CCtx. +* Use ZBUFF_compressInit() to start a new compression operation. +* Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary. +* +* Use ZBUFF_compressContinue() repetitively to consume input stream. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data. +* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst .
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency) +* or an error code, which can be tested using ZBUFF_isError(). +* +* At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush(). +* The nb of bytes written into `dst` will be reported into *dstCapacityPtr. +* Note that the function cannot output more than *dstCapacityPtr, +* therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small. +* @return : nb of bytes still present into internal buffer (0 if it's empty) +* or an error code, which can be tested using ZBUFF_isError(). +* +* ZBUFF_compressEnd() instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* The epilogue is required for decoders to consider a frame completed. +* Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small. +* In which case, call again ZBUFF_compressFlush() to complete the flush. +* @return : nb of bytes still present into internal buffer (0 if it's empty) +* or an error code, which can be tested using ZBUFF_isError(). +* +* Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize() +* input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency) +* output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering. +* By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering. 
+* **************************************************/ + + +typedef ZSTD_DStream ZBUFF_DCtx; +ZBUFF_DEPRECATED("use ZSTD_createDStream") ZBUFF_DCtx* ZBUFF_createDCtx(void); +ZBUFF_DEPRECATED("use ZSTD_freeDStream") size_t ZBUFF_freeDCtx(ZBUFF_DCtx* dctx); + +ZBUFF_DEPRECATED("use ZSTD_initDStream") size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx); +ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize); + +ZBUFF_DEPRECATED("use ZSTD_decompressStream") size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression howto +* +* A ZBUFF_DCtx object is required to track streaming operations. +* Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources. +* Use ZBUFF_decompressInit() to start a new decompression operation, +* or ZBUFF_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFF_DCtx objects can be re-init multiple times. +* +* Use ZBUFF_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`. +* @return : 0 when a frame is completely decoded and fully flushed, +* 1 when there is still some data left within internal buffer to flush, +* >1 when more data is expected, with value being a suggested next input size (it's just a hint, which helps latency), +* or an error code, which can be tested using ZBUFF_isError(). 
+* +* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() and ZBUFF_recommendedDOutSize() +* output : ZBUFF_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFF_recommendedDInSize == 128KB + 3; +* just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* *******************************************************************************/ + + +/* ************************************* +* Tool functions +***************************************/ +ZBUFF_DEPRECATED("use ZSTD_isError") unsigned ZBUFF_isError(size_t errorCode); +ZBUFF_DEPRECATED("use ZSTD_getErrorName") const char* ZBUFF_getErrorName(size_t errorCode); + +/** Functions below provide recommended buffer sizes for Compression or Decompression operations. +* These sizes are just hints, they tend to offer better latency */ +ZBUFF_DEPRECATED("use ZSTD_CStreamInSize") size_t ZBUFF_recommendedCInSize(void); +ZBUFF_DEPRECATED("use ZSTD_CStreamOutSize") size_t ZBUFF_recommendedCOutSize(void); +ZBUFF_DEPRECATED("use ZSTD_DStreamInSize") size_t ZBUFF_recommendedDInSize(void); +ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(void); + +#endif /* ZSTD_BUFFERED_H_23987 */ + + +#ifdef ZBUFF_STATIC_LINKING_ONLY +#ifndef ZBUFF_STATIC_H_30298098432 +#define ZBUFF_STATIC_H_30298098432 + +/* ==================================================================================== + * The definitions in this section are considered experimental. + * They should never be used in association with a dynamic library, as they may change in the future. + * They are provided for advanced usages. + * Use them only in association with static linking. 
+ * ==================================================================================== */ + +/*--- Dependency ---*/ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters, ZSTD_customMem */ +#include "zstd.h" + + +/*--- Custom memory allocator ---*/ +/*! ZBUFF_createCCtx_advanced() : + * Create a ZBUFF compression context using external alloc and free functions */ +ZBUFF_DEPRECATED("use ZSTD_createCStream_advanced") ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem); + +/*! ZBUFF_createDCtx_advanced() : + * Create a ZBUFF decompression context using external alloc and free functions */ +ZBUFF_DEPRECATED("use ZSTD_createDStream_advanced") ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem); + + +/*--- Advanced Streaming Initialization ---*/ +ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize); + + +#endif /* ZBUFF_STATIC_H_30298098432 */ +#endif /* ZBUFF_STATIC_LINKING_ONLY */ + + +#if defined (__cplusplus) +} +#endif diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/deprecated/zbuff_common.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/deprecated/zbuff_common.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/*-************************************* +* Dependencies +***************************************/ +#include "error_private.h" +#include "zbuff.h" + +/*-**************************************** +* ZBUFF Error Management (deprecated) +******************************************/ + +/*! ZBUFF_isError() : +* tells if a return value is an error code */ +unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); } +/*! ZBUFF_getErrorName() : +* provides error code string from function result (useful for debugging) */ +const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/deprecated/zbuff_compress.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/deprecated/zbuff_compress.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + + +/* ************************************* +* Dependencies +***************************************/ +#define ZBUFF_STATIC_LINKING_ONLY +#include "zbuff.h" + + +/*-*********************************************************** +* Streaming compression +* +* A ZBUFF_CCtx object is required to track streaming operation. +* Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources. +* Use ZBUFF_compressInit() to start a new compression operation. +* ZBUFF_CCtx objects can be reused multiple times. +* +* Use ZBUFF_compressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. 
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input. +* The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst . +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency) +* or an error code, which can be tested using ZBUFF_isError(). +* +* ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer. +* Note that it will not output more than *dstCapacityPtr. +* Therefore, some content might still be left into its internal buffer if dst buffer is too small. +* @return : nb of bytes still present into internal buffer (0 if it's empty) +* or an error code, which can be tested using ZBUFF_isError(). +* +* ZBUFF_compressEnd() instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small. +* @return : nb of bytes still present into internal buffer (0 if it's empty) +* or an error code, which can be tested using ZBUFF_isError(). +* +* Hint : recommended buffer sizes (not compulsory) +* input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value. +* output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed. 
+* ***********************************************************/ + +ZBUFF_CCtx* ZBUFF_createCCtx(void) +{ + return ZSTD_createCStream(); +} + +ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createCStream_advanced(customMem); +} + +size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc) +{ + return ZSTD_freeCStream(zbc); +} + + +/* ====== Initialization ====== */ + +size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* preserve "0 == unknown" behavior */ + return ZSTD_initCStream_advanced(zbc, dict, dictSize, params, pledgedSrcSize); +} + + +size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel) +{ + return ZSTD_initCStream_usingDict(zbc, dict, dictSize, compressionLevel); +} + +size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel) +{ + return ZSTD_initCStream(zbc, compressionLevel); +} + +/* ====== Compression ====== */ + + +size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr) +{ + size_t result; + ZSTD_outBuffer outBuff; + ZSTD_inBuffer inBuff; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + inBuff.src = src; + inBuff.pos = 0; + inBuff.size = *srcSizePtr; + result = ZSTD_compressStream(zbc, &outBuff, &inBuff); + *dstCapacityPtr = outBuff.pos; + *srcSizePtr = inBuff.pos; + return result; +} + + + +/* ====== Finalize ====== */ + +size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr) +{ + size_t result; + ZSTD_outBuffer outBuff; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + result = ZSTD_flushStream(zbc, &outBuff); + *dstCapacityPtr = outBuff.pos; + return result; +} + + +size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr) +{ + size_t result; + 
ZSTD_outBuffer outBuff; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + result = ZSTD_endStream(zbc, &outBuff); + *dstCapacityPtr = outBuff.pos; + return result; +} + + + +/* ************************************* +* Tool functions +***************************************/ +size_t ZBUFF_recommendedCInSize(void) { return ZSTD_CStreamInSize(); } +size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/deprecated/zbuff_decompress.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/deprecated/zbuff_decompress.c Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + + +/* ************************************* +* Dependencies +***************************************/ +#define ZBUFF_STATIC_LINKING_ONLY +#include "zbuff.h" + + +ZBUFF_DCtx* ZBUFF_createDCtx(void) +{ + return ZSTD_createDStream(); +} + +ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDStream_advanced(customMem); +} + +size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd) +{ + return ZSTD_freeDStream(zbd); +} + + +/* *** Initialization *** */ + +size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize) +{ + return ZSTD_initDStream_usingDict(zbd, dict, dictSize); +} + +size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd) +{ + return ZSTD_initDStream(zbd); +} + + +/* *** Decompression *** */ + +size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr) +{ + ZSTD_outBuffer outBuff; + ZSTD_inBuffer inBuff; + size_t result; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + inBuff.src = src; + inBuff.pos = 0; + inBuff.size = *srcSizePtr; + result = ZSTD_decompressStream(zbd, &outBuff, &inBuff); + *dstCapacityPtr = outBuff.pos; + *srcSizePtr = inBuff.pos; + return result; +} + + +/* ************************************* +* Tool functions +***************************************/ +size_t ZBUFF_recommendedDInSize(void) { return ZSTD_DStreamInSize(); } +size_t ZBUFF_recommendedDOutSize(void) { return ZSTD_DStreamOutSize(); } diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/dictBuilder/cover.c --- a/contrib/python-zstandard/zstd/dictBuilder/cover.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/dictBuilder/cover.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,12 +1,23 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ +/* ***************************************************************************** + * Constructs a dictionary using a heuristic based on the following paper: + * + * Liao, Petri, Moffat, Wirth + * Effective Construction of Relative Lempel-Ziv Dictionaries + * Published in WWW 2016. + * + * Adapted from code originally written by @ot (Giuseppe Ottaviano). + ******************************************************************************/ + /*-************************************* * Dependencies ***************************************/ @@ -49,8 +60,6 @@ if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ g_time = clock(); \ DISPLAY(__VA_ARGS__); \ - if (displayLevel >= 4) \ - fflush(stdout); \ } \ } #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) @@ -226,10 +235,22 @@ * Returns 1 if the dmer at lp is greater than the dmer at rp. */ static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) { - const U32 lhs = *(const U32 *)lp; - const U32 rhs = *(const U32 *)rp; + U32 const lhs = *(U32 const *)lp; + U32 const rhs = *(U32 const *)rp; return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d); } +/** + * Faster version for d <= 8. + */ +static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) { + U64 const mask = (ctx->d == 8) ? 
(U64)-1 : (((U64)1 << (8 * ctx->d)) - 1); + U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask; + U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask; + if (lhs < rhs) { + return -1; + } + return (lhs > rhs); +} /** * Same as COVER_cmp() except ties are broken by pointer value @@ -243,6 +264,16 @@ } return result; } +/** + * Faster version for d <= 8. + */ +static int COVER_strict_cmp8(const void *lp, const void *rp) { + int result = COVER_cmp8(g_ctx, lp, rp); + if (result == 0) { + result = lp < rp ? -1 : 1; + } + return result; +} /** * Returns the first pointer in [first, last) whose element does not compare @@ -352,7 +383,7 @@ typedef struct { U32 begin; U32 end; - double score; + U32 score; } COVER_segment_t; /** @@ -368,7 +399,8 @@ */ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs, COVER_map_t *activeDmers, U32 begin, - U32 end, COVER_params_t parameters) { + U32 end, + ZDICT_cover_params_t parameters) { /* Constants */ const U32 k = parameters.k; const U32 d = parameters.d; @@ -448,11 +480,16 @@ * Check the validity of the parameters. * Returns non-zero if the parameters are valid and 0 otherwise. 
*/ -static int COVER_checkParameters(COVER_params_t parameters) { +static int COVER_checkParameters(ZDICT_cover_params_t parameters, + size_t maxDictSize) { /* k and d are required parameters */ if (parameters.d == 0 || parameters.k == 0) { return 0; } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } /* d <= k */ if (parameters.d > parameters.k) { return 0; @@ -498,10 +535,10 @@ const BYTE *const samples = (const BYTE *)samplesBuffer; const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); /* Checks */ - if (totalSamplesSize < d || + if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { - DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n", - (COVER_MAX_SAMPLES_SIZE >> 20)); + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); return 0; } /* Zero the context */ @@ -512,7 +549,7 @@ ctx->samplesSizes = samplesSizes; ctx->nbSamples = nbSamples; /* Partial suffix array */ - ctx->suffixSize = totalSamplesSize - d + 1; + ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1; ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); /* Maps index to the dmerID */ ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); @@ -546,7 +583,8 @@ } /* qsort doesn't take an opaque pointer, so pass as a global */ g_ctx = ctx; - qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp); + qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); } DISPLAYLEVEL(2, "Computing frequencies\n"); /* For each dmer group (group of positions with the same first d bytes): @@ -556,8 +594,8 @@ * 2. We calculate how many samples the dmer occurs in and save it in * freqs[dmerId]. 
*/ - COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp, - &COVER_group); + COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, + (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group); ctx->freqs = ctx->suffix; ctx->suffix = NULL; return 1; @@ -569,7 +607,7 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, COVER_map_t *activeDmers, void *dictBuffer, size_t dictBufferCapacity, - COVER_params_t parameters) { + ZDICT_cover_params_t parameters) { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; /* Divide the data up into epochs of equal size. @@ -590,9 +628,13 @@ /* Select a segment */ COVER_segment_t segment = COVER_selectSegment( ctx, freqs, activeDmers, epochBegin, epochEnd, parameters); - /* Trim the segment if necessary and if it is empty then we are done */ + /* If the segment covers no dmers, then we are out of content */ + if (segment.score == 0) { + break; + } + /* Trim the segment if necessary and if it is too small then we are done */ segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); - if (segmentSize == 0) { + if (segmentSize < parameters.d) { break; } /* We fill the dictionary from the back to allow the best segments to be @@ -608,34 +650,19 @@ return tail; } -/** - * Translate from COVER_params_t to ZDICT_params_t required for finalizing the - * dictionary. - */ -static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) { - ZDICT_params_t zdictParams; - memset(&zdictParams, 0, sizeof(zdictParams)); - zdictParams.notificationLevel = 1; - zdictParams.dictID = parameters.dictID; - zdictParams.compressionLevel = parameters.compressionLevel; - return zdictParams; -} - -/** - * Constructs a dictionary using a heuristic based on the following paper: - * - * Liao, Petri, Moffat, Wirth - * Effective Construction of Relative Lempel-Ziv Dictionaries - * Published in WWW 2016. 
- */ -ZDICTLIB_API size_t COVER_trainFromBuffer( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) { - BYTE *const dict = (BYTE *)dictBuffer; +ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t parameters) +{ + BYTE* const dict = (BYTE*)dictBuffer; COVER_ctx_t ctx; COVER_map_t activeDmers; + + /* Initialize global data */ + g_displayLevel = parameters.zParams.notificationLevel; /* Checks */ - if (!COVER_checkParameters(parameters)) { + if (!COVER_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "Cover parameters incorrect\n"); return ERROR(GENERIC); } @@ -648,8 +675,6 @@ ZDICT_DICTSIZE_MIN); return ERROR(dstSize_tooSmall); } - /* Initialize global data */ - g_displayLevel = parameters.notificationLevel; /* Initialize context and activeDmers */ if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, parameters.d)) { @@ -666,10 +691,9 @@ const size_t tail = COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer, dictBufferCapacity, parameters); - ZDICT_params_t zdictParams = COVER_translateParams(parameters); const size_t dictionarySize = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - samplesBuffer, samplesSizes, nbSamples, zdictParams); + samplesBuffer, samplesSizes, nbSamples, parameters.zParams); if (!ZSTD_isError(dictionarySize)) { DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)dictionarySize); @@ -689,12 +713,12 @@ * compiled with multithreaded support. 
*/ typedef struct COVER_best_s { - pthread_mutex_t mutex; - pthread_cond_t cond; + ZSTD_pthread_mutex_t mutex; + ZSTD_pthread_cond_t cond; size_t liveJobs; void *dict; size_t dictSize; - COVER_params_t parameters; + ZDICT_cover_params_t parameters; size_t compressedSize; } COVER_best_t; @@ -702,11 +726,9 @@ * Initialize the `COVER_best_t`. */ static void COVER_best_init(COVER_best_t *best) { - if (!best) { - return; - } - pthread_mutex_init(&best->mutex, NULL); - pthread_cond_init(&best->cond, NULL); + if (best==NULL) return; /* compatible with init on NULL */ + (void)ZSTD_pthread_mutex_init(&best->mutex, NULL); + (void)ZSTD_pthread_cond_init(&best->cond, NULL); best->liveJobs = 0; best->dict = NULL; best->dictSize = 0; @@ -721,11 +743,11 @@ if (!best) { return; } - pthread_mutex_lock(&best->mutex); + ZSTD_pthread_mutex_lock(&best->mutex); while (best->liveJobs != 0) { - pthread_cond_wait(&best->cond, &best->mutex); + ZSTD_pthread_cond_wait(&best->cond, &best->mutex); } - pthread_mutex_unlock(&best->mutex); + ZSTD_pthread_mutex_unlock(&best->mutex); } /** @@ -739,8 +761,8 @@ if (best->dict) { free(best->dict); } - pthread_mutex_destroy(&best->mutex); - pthread_cond_destroy(&best->cond); + ZSTD_pthread_mutex_destroy(&best->mutex); + ZSTD_pthread_cond_destroy(&best->cond); } /** @@ -751,9 +773,9 @@ if (!best) { return; } - pthread_mutex_lock(&best->mutex); + ZSTD_pthread_mutex_lock(&best->mutex); ++best->liveJobs; - pthread_mutex_unlock(&best->mutex); + ZSTD_pthread_mutex_unlock(&best->mutex); } /** @@ -762,14 +784,14 @@ * If this dictionary is the best so far save it and its parameters. 
*/ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize, - COVER_params_t parameters, void *dict, + ZDICT_cover_params_t parameters, void *dict, size_t dictSize) { if (!best) { return; } { size_t liveJobs; - pthread_mutex_lock(&best->mutex); + ZSTD_pthread_mutex_lock(&best->mutex); --best->liveJobs; liveJobs = best->liveJobs; /* If the new dictionary is better */ @@ -792,9 +814,9 @@ best->parameters = parameters; best->compressedSize = compressedSize; } - pthread_mutex_unlock(&best->mutex); + ZSTD_pthread_mutex_unlock(&best->mutex); if (liveJobs == 0) { - pthread_cond_broadcast(&best->cond); + ZSTD_pthread_cond_broadcast(&best->cond); } } } @@ -806,7 +828,7 @@ const COVER_ctx_t *ctx; COVER_best_t *best; size_t dictBufferCapacity; - COVER_params_t parameters; + ZDICT_cover_params_t parameters; } COVER_tryParameters_data_t; /** @@ -818,7 +840,7 @@ /* Save parameters as local variables */ COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque; const COVER_ctx_t *const ctx = data->ctx; - const COVER_params_t parameters = data->parameters; + const ZDICT_cover_params_t parameters = data->parameters; size_t dictBufferCapacity = data->dictBufferCapacity; size_t totalCompressedSize = ERROR(GENERIC); /* Allocate space for hash table, dict, and freqs */ @@ -839,10 +861,10 @@ { const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, dictBufferCapacity, parameters); - const ZDICT_params_t zdictParams = COVER_translateParams(parameters); dictBufferCapacity = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams); + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, + parameters.zParams); if (ZDICT_isError(dictBufferCapacity)) { DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); goto _cleanup; @@ -868,13 +890,13 @@ } /* Create the cctx and cdict */ cctx = ZSTD_createCCtx(); - cdict = - 
ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel); + cdict = ZSTD_createCDict(dict, dictBufferCapacity, + parameters.zParams.compressionLevel); if (!dst || !cctx || !cdict) { goto _compressCleanup; } /* Compress each sample and sum their sizes (or error) */ - totalCompressedSize = 0; + totalCompressedSize = dictBufferCapacity; for (i = 0; i < ctx->nbSamples; ++i) { const size_t size = ZSTD_compress_usingCDict( cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], @@ -906,29 +928,28 @@ } } -ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer, - size_t dictBufferCapacity, - const void *samplesBuffer, - const size_t *samplesSizes, - unsigned nbSamples, - COVER_params_t *parameters) { +ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t *parameters) { /* constants */ const unsigned nbThreads = parameters->nbThreads; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; - const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d; - const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k; - const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k; - const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps; + const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 
40 : parameters->steps; const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); const unsigned kIterations = (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); /* Local variables */ - const int displayLevel = parameters->notificationLevel; + const int displayLevel = parameters->zParams.notificationLevel; unsigned iteration = 1; unsigned d; unsigned k; COVER_best_t best; POOL_ctx *pool = NULL; + /* Checks */ if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); @@ -952,7 +973,7 @@ /* Initialization */ COVER_best_init(&best); /* Turn down global display level to clean up display at level 2 and below */ - g_displayLevel = parameters->notificationLevel - 1; + g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1; /* Loop through d first because each new value needs a new context */ LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", kIterations); @@ -963,6 +984,7 @@ if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) { LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); COVER_best_destroy(&best); + POOL_free(pool); return ERROR(GENERIC); } /* Loop through k reusing the same context */ @@ -975,6 +997,7 @@ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n"); COVER_best_destroy(&best); COVER_ctx_destroy(&ctx); + POOL_free(pool); return ERROR(GENERIC); } data->ctx = &ctx; @@ -984,9 +1007,11 @@ data->parameters.k = k; data->parameters.d = d; data->parameters.steps = kSteps; + data->parameters.zParams.notificationLevel = g_displayLevel; /* Check the parameters */ - if (!COVER_checkParameters(data->parameters)) { + if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "Cover parameters incorrect\n"); + free(data); continue; } /* Call the function and pass ownership of data to it */ @@ -1009,8 +1034,10 @@ { const size_t dictSize = best.dictSize; if (ZSTD_isError(best.compressedSize)) { + const size_t 
compressedSize = best.compressedSize; COVER_best_destroy(&best); - return best.compressedSize; + POOL_free(pool); + return compressedSize; } *parameters = best.parameters; memcpy(dictBuffer, best.dict, dictSize); diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/dictBuilder/zdict.c --- a/contrib/python-zstandard/zstd/dictBuilder/zdict.c Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/dictBuilder/zdict.c Wed Apr 18 15:32:08 2018 -0400 @@ -1,18 +1,20 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ /*-************************************** * Tuning parameters ****************************************/ +#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) -#define ZDICT_MIN_SAMPLES_SIZE 512 +#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) /*-************************************** @@ -59,11 +61,8 @@ #define NOISELENGTH 32 -#define MINRATIO 4 -static const int g_compressionLevel_default = 6; +static const int g_compressionLevel_default = 3; static const U32 g_selectivity_default = 9; -static const size_t g_provision_entropySize = 200; -static const size_t g_min_fast_dictContent = 192; /*-************************************* @@ -96,7 +95,7 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) { if (dictSize < 8) return 0; - if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0; + if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0; return MEM_readLE32((const char*)dictBuffer + 4); } @@ -104,7 +103,7 @@ /*-******************************************************** * Dictionary training functions **********************************************************/ -static unsigned ZDICT_NbCommonBytes (register size_t val) +static unsigned ZDICT_NbCommonBytes (size_t val) { if (MEM_isLittleEndian()) { if (MEM_64bits()) { @@ -208,7 +207,6 @@ U32 cumulLength[LLIMIT] = {0}; U32 savings[LLIMIT] = {0}; const BYTE* b = (const BYTE*)buffer; - size_t length; size_t maxLength = LLIMIT; size_t pos = suffix[start]; U32 end = start; @@ -223,26 +221,30 @@ ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { /* skip and mark segment */ - U16 u16 = MEM_read16(b+pos+4); - U32 u, e = 6; - while (MEM_read16(b+pos+e) == u16) e+=2 ; - if (b[pos+e] == b[pos+e-1]) e++; - for (u=1; u=MINMATCHLENGTH); + { size_t length; + do { + end++; + length = ZDICT_count(b + pos, b + suffix[end]); + } while (length >= MINMATCHLENGTH); + } /* look 
backward */ - do { - length = ZDICT_count(b + pos, b + *(suffix+start-1)); - if (length >=MINMATCHLENGTH) start--; - } while(length >= MINMATCHLENGTH); + { size_t length; + do { + length = ZDICT_count(b + pos, b + *(suffix+start-1)); + if (length >=MINMATCHLENGTH) start--; + } while(length >= MINMATCHLENGTH); + } /* exit if not found a minimum nb of repetitions */ if (end-start < minRatio) { @@ -269,7 +271,7 @@ U32 selectedCount = 0; U32 selectedID = currentID; for (id =refinedStart; id < refinedEnd; id++) { - if (b[ suffix[id] + searchLength] != currentChar) { + if (b[suffix[id] + searchLength] != currentChar) { if (currentCount > selectedCount) { selectedCount = currentCount; selectedID = currentID; @@ -298,20 +300,23 @@ memset(lengthList, 0, sizeof(lengthList)); /* look forward */ - do { - end++; - length = ZDICT_count(b + pos, b + suffix[end]); - if (length >= LLIMIT) length = LLIMIT-1; - lengthList[length]++; - } while (length >=MINMATCHLENGTH); + { size_t length; + do { + end++; + length = ZDICT_count(b + pos, b + suffix[end]); + if (length >= LLIMIT) length = LLIMIT-1; + lengthList[length]++; + } while (length >=MINMATCHLENGTH); + } /* look backward */ - length = MINMATCHLENGTH; - while ((length >= MINMATCHLENGTH) & (start > 0)) { - length = ZDICT_count(b + pos, b + suffix[start - 1]); - if (length >= LLIMIT) length = LLIMIT - 1; - lengthList[length]++; - if (length >= MINMATCHLENGTH) start--; + { size_t length = MINMATCHLENGTH; + while ((length >= MINMATCHLENGTH) & (start > 0)) { + length = ZDICT_count(b + pos, b + suffix[start - 1]); + if (length >= LLIMIT) length = LLIMIT - 1; + lengthList[length]++; + if (length >= MINMATCHLENGTH) start--; + } } /* largest useful length */ @@ -346,12 +351,12 @@ /* mark positions done */ { U32 id; for (id=start; id solution.length) length = solution.length; } pEnd = (U32)(testedPos + length); @@ -363,21 +368,35 @@ } -/*! 
ZDICT_checkMerge +static int isIncluded(const void* in, const void* container, size_t length) +{ + const char* const ip = (const char*) in; + const char* const into = (const char*) container; + size_t u; + + for (u=0; upos; const U32 eltEnd = elt.pos + elt.length; + const char* const buf = (const char*) buffer; /* tail overlap */ U32 u; for (u=1; u elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ /* append */ - U32 addedLength = table[u].pos - elt.pos; + U32 const addedLength = table[u].pos - elt.pos; table[u].length += addedLength; table[u].pos = elt.pos; table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ @@ -393,9 +412,10 @@ /* front overlap */ for (u=1; u= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ /* append */ - int addedLength = (int)eltEnd - (table[u].pos + table[u].length); + int const addedLength = (int)eltEnd - (table[u].pos + table[u].length); table[u].savings += elt.length / 8; /* rough approx bonus */ if (addedLength > 0) { /* otherwise, elt fully included into existing */ table[u].length += addedLength; @@ -407,7 +427,18 @@ table[u] = table[u-1], u--; table[u] = elt; return u; - } } + } + + if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) { + if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) { + size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 ); + table[u].pos = elt.pos; + table[u].savings += (U32)(elt.savings * addedLength / elt.length); + table[u].length = MIN(elt.length, table[u].length + 1); + return u; + } + } + } return 0; } @@ -415,8 +446,8 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id) { - /* convention : first element is nb of elts */ - U32 const max = table->pos; + /* convention : table[0].pos stores nb of elts */ + U32 const max = table[0].pos; U32 u; if (!id) return; /* protection, should never happen */ for (u=id; u=l) { \ if (ZDICT_clockSpan(displayClock) > refreshRate) \ { 
displayClock = clock(); DISPLAY(__VA_ARGS__); \ - if (notificationLevel>=4) fflush(stdout); } } + if (notificationLevel>=4) fflush(stderr); } } /* init */ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ @@ -521,7 +552,7 @@ if (doneMarks[cursor]) { cursor++; continue; } solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); if (solution.length==0) { cursor++; continue; } - ZDICT_insertDictItem(dictList, dictListSize, solution); + ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); cursor += solution.length; DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); } } @@ -550,29 +581,30 @@ typedef struct { - ZSTD_CCtx* ref; - ZSTD_CCtx* zc; - void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */ + ZSTD_CCtx* ref; /* contains reference to dictionary */ + ZSTD_CCtx* zc; /* working context */ + void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ } EStats_ress_t; #define MAXREPOFFSET 1024 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, - U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, - const void* src, size_t srcSize, U32 notificationLevel) + U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, + const void* src, size_t srcSize, + U32 notificationLevel) { - size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog); + size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); size_t cSize; if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ - { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); - if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } + { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); + if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); 
return; } } - cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize); + cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } if (cSize) { /* if == 0; block is not compressible */ - const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); + const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); /* literals stats */ { const BYTE* bytePtr; @@ -610,17 +642,6 @@ } } } } -/* -static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles) -{ - unsigned u; - size_t max=0; - for (u=0; uOFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */ - for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */ - for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1; - for (u=0; u<=MaxML; u++) matchLengthCount[u]=1; - for (u=0; u<=MaxLL; u++) litLengthCount[u]=1; + if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ + for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ + for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; + for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; + for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; memset(repOffset, 0, sizeof(repOffset)); repOffset[1] = repOffset[4] = repOffset[8] = 1; memset(bestRepOffset, 0, sizeof(bestRepOffset)); - if (compressionLevel==0) compressionLevel=g_compressionLevel_default; + if (compressionLevel<=0) compressionLevel = g_compressionLevel_default; params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); - if (ZSTD_isError(beginResult)) { + if (ZSTD_isError(beginResult)) { + DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", 
ZSTD_getErrorName(beginResult)); eSize = ERROR(GENERIC); - DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n"); goto _cleanup; } } - /* collect stats on all files */ + /* collect stats on all samples */ for (u=0; u= 3) { + if (params.zParams.notificationLevel>= 3) { U32 const nb = MIN(25, dictList[0].pos); U32 const dictContentSize = ZDICT_dictSize(dictList); U32 u; @@ -963,14 +1002,15 @@ /* create dictionary */ { U32 dictContentSize = ZDICT_dictSize(dictList); - if (dictContentSize < targetDictSize/3) { + if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ + if (dictContentSize < targetDictSize/4) { DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); + if (samplesBuffSize < 10 * targetDictSize) + DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20)); if (minRep > MINRATIO) { DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); } - if (samplesBuffSize < 10 * targetDictSize) - DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20)); } if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { @@ -978,7 +1018,7 @@ while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize); DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); - DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n"); + DISPLAYLEVEL(2, "! 
always test dictionary efficiency on real samples \n"); } /* limit dictionary size */ @@ -1004,7 +1044,7 @@ dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, samplesBuffer, samplesSizes, nbSamples, - params); + params.zParams); } /* clean up */ @@ -1013,11 +1053,12 @@ } -/* issue : samplesBuffer need to be followed by a noisy guard band. -* work around : duplicate the buffer, and add the noise */ -size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_params_t params) +/* ZDICT_trainFromBuffer_legacy() : + * issue : samplesBuffer need to be followed by a noisy guard band. + * work around : duplicate the buffer, and add the noise */ +size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_legacy_params_t params) { size_t result; void* newBuff; @@ -1030,10 +1071,9 @@ memcpy(newBuff, samplesBuffer, sBuffSize); ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ - result = ZDICT_trainFromBuffer_unsafe( - dictBuffer, dictBufferCapacity, - newBuff, samplesSizes, nbSamples, - params); + result = + ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff, + samplesSizes, nbSamples, params); free(newBuff); return result; } @@ -1042,15 +1082,23 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) { - ZDICT_params_t params; + ZDICT_cover_params_t params; + DEBUGLOG(3, "ZDICT_trainFromBuffer"); memset(¶ms, 0, sizeof(params)); - return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity, - samplesBuffer, samplesSizes, nbSamples, - params); + params.d = 8; + params.steps = 4; + /* Default to level 6 since no compression level information 
is available */ + params.zParams.compressionLevel = 6; +#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1) + params.zParams.notificationLevel = ZSTD_DEBUG; +#endif + return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, + samplesBuffer, samplesSizes, nbSamples, + ¶ms); } size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) { ZDICT_params_t params; memset(¶ms, 0, sizeof(params)); diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/dictBuilder/zdict.h --- a/contrib/python-zstandard/zstd/dictBuilder/zdict.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/dictBuilder/zdict.h Wed Apr 18 15:32:08 2018 -0400 @@ -1,10 +1,11 @@ -/** +/* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ #ifndef DICTBUILDER_H_001 @@ -20,10 +21,12 @@ /* ===== ZDICTLIB_API : control library symbols visibility ===== */ -#if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default"))) -#else -# define ZDICTLIB_VISIBILITY +#ifndef ZDICTLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZDICTLIB_VISIBILITY +# endif #endif #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY @@ -34,20 +37,22 @@ #endif -/*! ZDICT_trainFromBuffer() : - Train a dictionary from an array of samples. - Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. - The resulting dictionary will be saved into `dictBuffer`. - @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - or an error code, which can be tested with ZDICT_isError(). - Tips : In general, a reasonable dictionary has a size of ~ 100 KB. - It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. - In general, it's recommended to provide a few thousands samples, but this can vary a lot. - It's recommended that total size of all samples be about ~x100 times the target size of dictionary. -*/ +/*! ZDICT_trainFromBuffer(): + * Train a dictionary from an array of samples. + * Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. 
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousands samples, though this can vary a lot. + * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. + */ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); /*====== Helper functions ======*/ @@ -67,102 +72,108 @@ * ==================================================================================== */ typedef struct { - unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ - int compressionLevel; /* 0 means default; target a specific zstd compression level */ - unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ - unsigned reserved[2]; /* reserved space for future parameters */ + int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */ + unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ + unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */ } ZDICT_params_t; - -/*! ZDICT_trainFromBuffer_advanced() : - Same as ZDICT_trainFromBuffer() with control over more parameters. 
- `parameters` is optional and can be provided with values set to 0 to mean "default". - @return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`), - or an error code, which can be tested by ZDICT_isError(). - note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using notificationLevel>0. -*/ -ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_params_t parameters); - -/*! COVER_params_t : - For all values 0 means default. - kMin and d are the only required parameters. -*/ +/*! ZDICT_cover_params_t: + * k and d are the only required parameters. + * For others, value 0 means default. + */ typedef struct { unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ - unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ - unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ - int compressionLevel; /* 0 means default; target a specific zstd compression level */ -} COVER_params_t; + ZDICT_params_t zParams; +} ZDICT_cover_params_t; -/*! COVER_trainFromBuffer() : - Train a dictionary from an array of samples using the COVER algorithm. - Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. - The resulting dictionary will be saved into `dictBuffer`. 
- @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - or an error code, which can be tested with ZDICT_isError(). - Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte. - Tips : In general, a reasonable dictionary has a size of ~ 100 KB. - It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. - In general, it's recommended to provide a few thousands samples, but this can vary a lot. - It's recommended that total size of all samples be about ~x100 times the target size of dictionary. -*/ -ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - COVER_params_t parameters); - -/*! COVER_optimizeTrainFromBuffer() : - The same requirements as above hold for all the parameters except `parameters`. - This function tries many parameter combinations and picks the best parameters. - `*parameters` is filled with the best parameters found, and the dictionary - constructed with those parameters is stored in `dictBuffer`. - - All of the parameters d, k, steps are optional. - If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. - if steps is zero it defaults to its default value. - If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048]. +/*! ZDICT_trainFromBuffer_cover(): + * Train a dictionary from an array of samples using the COVER algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). 
+ * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousands samples, though this can vary a lot. + * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. + */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t parameters); - @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - or an error code, which can be tested with ZDICT_isError(). - On success `*parameters` contains the parameters selected. - Note : COVER_optimizeTrainFromBuffer() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. -*/ -ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, - COVER_params_t *parameters); - -/*! ZDICT_finalizeDictionary() : - - Given a custom content as a basis for dictionary, and a set of samples, - finalize dictionary by adding headers and statistics. +/*! ZDICT_optimizeTrainFromBuffer_cover(): + * The same requirements as above hold for all the parameters except `parameters`. + * This function tries many parameter combinations and picks the best parameters. + * `*parameters` is filled with the best parameters found, + * dictionary constructed with those parameters is stored in `dictBuffer`. + * + * All of the parameters d, k, steps are optional. + * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. + * if steps is zero it defaults to its default value. 
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048]. + * + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + * On success `*parameters` contains the parameters selected. + * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. + */ +ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( + void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t* parameters); - Samples must be stored concatenated in a flat buffer `samplesBuffer`, - supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. - - dictContentSize must be > ZDICT_CONTENTSIZE_MIN bytes. - maxDictSize must be >= dictContentSize, and must be > ZDICT_DICTSIZE_MIN bytes. - - @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), - or an error code, which can be tested by ZDICT_isError(). - note : ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. - note 2 : dictBuffer and customDictContent can overlap -*/ -#define ZDICT_CONTENTSIZE_MIN 256 -#define ZDICT_DICTSIZE_MIN 512 +/*! ZDICT_finalizeDictionary(): + * Given a custom content as a basis for dictionary, and a set of samples, + * finalize dictionary by adding headers and statistics. + * + * Samples must be stored concatenated in a flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. + * + * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes. + * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes. 
+ * + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), + * or an error code, which can be tested by ZDICT_isError(). + * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. + * Note 2: dictBuffer and dictContent can overlap + */ +#define ZDICT_CONTENTSIZE_MIN 128 +#define ZDICT_DICTSIZE_MIN 256 ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, - const void* customDictContent, size_t dictContentSize, + const void* dictContent, size_t dictContentSize, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t parameters); +typedef struct { + unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ + ZDICT_params_t zParams; +} ZDICT_legacy_params_t; +/*! ZDICT_trainFromBuffer_legacy(): + * Train a dictionary from an array of samples. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * `parameters` is optional and can be provided with values set to 0 to mean "default". + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousands samples, though this can vary a lot. + * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. + * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. 
+ */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_legacy_params_t parameters); /* Deprecation warnings */ /* It is generally possible to disable deprecation warnings from compiler, @@ -174,7 +185,7 @@ #else # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ -# define ZDICT_DEPRECATED(message) ZDICTLIB_API [[deprecated(message)]] +# define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API # elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__) # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message))) # elif (ZDICT_GCC_VERSION >= 301) diff -r fb92df8b634c -r ed5448edcbfa contrib/python-zstandard/zstd/zstd.h --- a/contrib/python-zstandard/zstd/zstd.h Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python-zstandard/zstd/zstd.h Wed Apr 18 15:32:08 2018 -0400 @@ -2,11 +2,11 @@ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
*/ - #if defined (__cplusplus) extern "C" { #endif @@ -19,10 +19,12 @@ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ -#if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) -#else -# define ZSTDLIB_VISIBILITY +#ifndef ZSTDLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDLIB_VISIBILITY +# endif #endif #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) # define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY @@ -36,117 +38,147 @@ /******************************************************************************************************* Introduction - zstd, short for Zstandard, is a fast lossless compression algorithm, targeting real-time compression scenarios - at zlib-level and better compression ratios. The zstd compression library provides in-memory compression and - decompression functions. The library supports compression levels from 1 up to ZSTD_maxCLevel() which is 22. - Levels >= 20, labelled `--ultra`, should be used with caution, as they require more memory. + zstd, short for Zstandard, is a fast lossless compression algorithm, + targeting real-time compression scenarios at zlib-level and better compression ratios. + The zstd compression library provides in-memory compression and decompression functions. + The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22. + Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory. 
Compression can be done in: - a single step (described as Simple API) - - a single step, reusing a context (described as Explicit memory management) + - a single step, reusing a context (described as Explicit context) - unbounded multiple steps (described as Streaming compression) - The compression ratio achievable on small data can be highly improved using compression with a dictionary in: + The compression ratio achievable on small data can be highly improved using a dictionary in: - a single step (described as Simple dictionary API) - - a single step, reusing a dictionary (described as Fast dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing dictionary API) Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h. - These APIs shall never be used with a dynamic library. + Advanced experimental APIs shall never be used with a dynamic library. They are not "stable", their definition may change in the future. Only static linking is allowed. 
*********************************************************************************************************/ /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 -#define ZSTD_VERSION_MINOR 1 -#define ZSTD_VERSION_RELEASE 3 +#define ZSTD_VERSION_MINOR 3 +#define ZSTD_VERSION_RELEASE 4 + +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) +ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< useful to check dll version */ #define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE #define ZSTD_QUOTE(str) #str #define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) #define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) - -#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) -ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< library version number; to be used when checking dll version */ +ZSTDLIB_API const char* ZSTD_versionString(void); /* added in v1.3.0 */ /*************************************** * Simple API ***************************************/ /*! ZSTD_compress() : - Compresses `src` content as a single zstd compressed frame into already allocated `dst`. - Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. - @return : compressed size written into `dst` (<= `dstCapacity), - or an error code if it fails (which can be tested using ZSTD_isError()). */ + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel); /*! ZSTD_decompress() : - `compressedSize` : must be the _exact_ size of a single compressed frame. 
- `dstCapacity` is an upper bound of originalSize. - If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. - @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), - or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, const void* src, size_t compressedSize); +/*! ZSTD_getFrameContentSize() : added in v1.3.0 + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of the frame in `src`, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). 
+ * note 3 : decompressed size is always present when compression is done with ZSTD_compress() + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + /*! ZSTD_getDecompressedSize() : -* 'src' is the start of a zstd compressed frame. -* @return : content size to be decompressed, as a 64-bits value _if known_, 0 otherwise. -* note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. -* When `return==0`, data to decompress could be any size. -* In which case, it's necessary to use streaming mode to decompress data. -* Optionally, application can still use ZSTD_decompress() while relying on implied limits. -* (For example, data may be necessarily cut into blocks <= 16 KB). -* note 2 : decompressed size is always present when compression is done with ZSTD_compress() -* note 3 : decompressed size can be very large (64-bits value), -* potentially larger than what local system can handle as a single memory segment. -* In which case, it's necessary to use streaming mode to decompress data. -* note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. -* Always ensure result fits within application's authorized limits. -* Each application can set its own limits. -* note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameParams() to know more. 
*/ + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * `src` is the start of a zstd compressed frame. + * @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. */ ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); /*====== Helper functions ======*/ -ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ -ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ /*************************************** -* Explicit memory management +* Explicit context ***************************************/ /*= Compression context -* When compressing many times, -* it is recommended to allocate a context just once, and re-use it for each successive compression operation. -* This will make workload friendlier for system's memory. -* Use one context per thread for parallel execution in multi-threaded environments. 
*/ + * When compressing many times, + * it is recommended to allocate a context just once, and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution in multi-threaded environments. */ typedef struct ZSTD_CCtx_s ZSTD_CCtx; ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /*! ZSTD_compressCCtx() : - Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */ -ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel); + * Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); -/*= Decompression context */ +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ typedef struct ZSTD_DCtx_s ZSTD_DCtx; ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /*! ZSTD_decompressDCtx() : -* Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()). */ -ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + * Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()) */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); /************************** * Simple dictionary API ***************************/ /*! 
ZSTD_compress_usingDict() : -* Compression using a predefined Dictionary (see dictBuilder/zdict.h). -* Note : This function loads the dictionary, resulting in significant startup delay. -* Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ + * Compression using a predefined Dictionary (see dictBuilder/zdict.h). + * Note : This function loads the dictionary, resulting in significant startup delay. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -154,36 +186,38 @@ int compressionLevel); /*! ZSTD_decompress_usingDict() : -* Decompression using a predefined Dictionary (see dictBuilder/zdict.h). -* Dictionary must be identical to the one used during compression. -* Note : This function loads the dictionary, resulting in significant startup delay. -* Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ + * Decompression using a predefined Dictionary (see dictBuilder/zdict.h). + * Dictionary must be identical to the one used during compression. + * Note : This function loads the dictionary, resulting in significant startup delay. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize); -/**************************** -* Fast dictionary API -****************************/ +/********************************** + * Bulk processing dictionary API + *********************************/ typedef struct ZSTD_CDict_s ZSTD_CDict; /*! ZSTD_createCDict() : -* When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. -* ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. 
-* ZSTD_CDict can be created once and used by multiple threads concurrently, as its usage is read-only. -* `dictBuffer` can be released after ZSTD_CDict creation, as its content is copied within CDict */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, int compressionLevel); + * When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. + * ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. + * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, + int compressionLevel); /*! ZSTD_freeCDict() : -* Function frees memory allocated by ZSTD_createCDict(). */ + * Function frees memory allocated by ZSTD_createCDict(). */ ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); /*! ZSTD_compress_usingCDict() : -* Compression using a digested Dictionary. -* Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. -* Note that compression level is decided during dictionary creation. */ + * Compression using a digested Dictionary. + * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. + * Note that compression level is decided during dictionary creation. + * Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */ ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -193,17 +227,17 @@ typedef struct ZSTD_DDict_s ZSTD_DDict; /*! ZSTD_createDDict() : -* Create a digested dictionary, ready to start decompression operation without startup delay. 
-* dictBuffer can be released after DDict creation, as its content is copied inside DDict */ + * Create a digested dictionary, ready to start decompression operation without startup delay. + * dictBuffer can be released after DDict creation, as its content is copied inside DDict */ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); /*! ZSTD_freeDDict() : -* Function frees memory allocated with ZSTD_createDDict() */ + * Function frees memory allocated with ZSTD_createDDict() */ ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); /*! ZSTD_decompress_usingDDict() : -* Decompression using a digested Dictionary. -* Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. */ + * Decompression using a digested Dictionary. + * Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. */ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -260,17 +294,22 @@ * ZSTD_endStream() instructs to finish a frame. * It will perform a flush and write frame epilogue. * The epilogue is required for decoders to consider a frame completed. -* Similar to ZSTD_flushStream(), it may not be able to flush the full content if `output->size` is too small. +* ZSTD_endStream() may not be able to flush full data if `output->size` is too small. * In which case, call again ZSTD_endStream() to complete the flush. -* @return : nb of bytes still present within internal buffer (0 if it's empty, hence compression completed) +* @return : 0 if frame fully completed and fully flushed, + or >0 if some data is still present within internal buffer + (value is minimum size estimation for remaining data to flush, but it could be more) * or an error code, which can be tested using ZSTD_isError(). 
* * *******************************************************************/ -typedef struct ZSTD_CStream_s ZSTD_CStream; +typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */ + /* Continue to distinguish them for compatibility with versions <= v1.2.0 */ +/*===== ZSTD_CStream management functions =====*/ ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void); ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); +/*===== Streaming compression functions =====*/ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); @@ -303,10 +342,13 @@ * The return value is a suggested next input size (a hint to improve latency) that will never load more than the current frame. * *******************************************************************************/ -typedef struct ZSTD_DStream_s ZSTD_DStream; +typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */ + /* For compatibility with versions <= v1.2.0, continue to consider them separated. */ +/*===== ZSTD_DStream management functions =====*/ ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); +/*===== Streaming decompression functions =====*/ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); @@ -316,47 +358,55 @@ #endif /* ZSTD_H_235446 */ -#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) -#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY /**************************************************************************************** * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS * The definitions in this section are considered experimental. 
- * They should never be used with a dynamic library, as they may change in the future. - * They are provided for advanced usages. + * They should never be used with a dynamic library, as prototypes may change in the future. + * They are provided for advanced scenarios. * Use them only in association with static linking. * ***************************************************************************************/ +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + /* --- Constants ---*/ #define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ #define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* >= v0.7.0 */ -#define ZSTD_WINDOWLOG_MAX_32 25 -#define ZSTD_WINDOWLOG_MAX_64 27 -#define ZSTD_WINDOWLOG_MAX ((U32)(MEM_32bits() ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) -#define ZSTD_WINDOWLOG_MIN 10 -#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX -#define ZSTD_HASHLOG_MIN 6 -#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1) -#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN -#define ZSTD_HASHLOG3_MAX 17 -#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) -#define ZSTD_SEARCHLOG_MIN 1 -#define ZSTD_SEARCHLENGTH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */ -#define ZSTD_SEARCHLENGTH_MIN 3 /* only for ZSTD_btopt, other strategies are limited to 4 */ -#define ZSTD_TARGETLENGTH_MIN 4 -#define ZSTD_TARGETLENGTH_MAX 999 +#define ZSTD_WINDOWLOG_MAX_32 30 +#define ZSTD_WINDOWLOG_MAX_64 31 +#define ZSTD_WINDOWLOG_MAX ((unsigned)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30) +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX_32 29 +#define ZSTD_CHAINLOG_MAX_64 30 +#define ZSTD_CHAINLOG_MAX ((unsigned)(sizeof(size_t) == 4 ? 
ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64)) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_HASHLOG3_MAX 17 +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +#define ZSTD_SEARCHLENGTH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_SEARCHLENGTH_MIN 3 /* only for ZSTD_btopt, other strategies are limited to 4 */ +#define ZSTD_TARGETLENGTH_MIN 1 /* only used by btopt, btultra and btfast */ +#define ZSTD_LDM_MINMATCH_MIN 4 +#define ZSTD_LDM_MINMATCH_MAX 4096 +#define ZSTD_LDM_BUCKETSIZELOG_MAX 8 -#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* for static allocation */ -#define ZSTD_FRAMEHEADERSIZE_MIN 6 -static const size_t ZSTD_frameHeaderSize_prefix = 5; +#define ZSTD_FRAMEHEADERSIZE_PREFIX 5 /* minimum input size to know frame header size */ +#define ZSTD_FRAMEHEADERSIZE_MIN 6 +#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* for static allocation */ +static const size_t ZSTD_frameHeaderSize_prefix = ZSTD_FRAMEHEADERSIZE_PREFIX; static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN; static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX; static const size_t ZSTD_skippableHeaderSize = 8; /* magic number + skippable frame length */ /*--- Advanced types ---*/ -typedef enum { ZSTD_fast, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2, ZSTD_btopt, ZSTD_btopt2 } ZSTD_strategy; /* from faster to stronger */ +typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, + ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy; /* from faster to stronger */ typedef struct { unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */ @@ -379,35 +429,190 @@ ZSTD_frameParameters fParams; } ZSTD_parameters; -/*= Custom memory allocation functions */ +typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params; + +typedef enum { + ZSTD_dct_auto=0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, 
otherwise it is "rawContent" */ + ZSTD_dct_rawContent, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */ + ZSTD_dct_fullDict /* refuses to load a dictionary if it does not respect Zstandard's specification */ +} ZSTD_dictContentType_e; + +typedef enum { + ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */ + ZSTD_dlm_byRef, /**< Reference dictionary content -- the dictionary buffer must outlive its users. */ +} ZSTD_dictLoadMethod_e; + + + +/*************************************** +* Frame size functions +***************************************/ + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD encoded frame or skippable frame + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findDecompressedSize() : + * `src` should point the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary exactly at `srcSize` bytes after `src`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. 
+ * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. */ +ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_frameHeaderSize() : +* `src` should point to the start of a ZSTD frame +* `srcSize` must be >= ZSTD_frameHeaderSize_prefix. +* @return : size of the Frame Header */ +ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + + +/*************************************** +* Memory management +***************************************/ + +/*! ZSTD_sizeof_*() : + * These functions give the current memory usage of selected object. + * Object memory usage can evolve when re-used. */ +ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); +ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. 
+ * ZSTD_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one. + * It will also consider src size to be arbitrarily "large", which is worst case. + * If srcSize is known to always be small, ZSTD_estimateCCtxSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCCtxSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1. + * Note : CCtx size estimation is only correct for single-threaded compression. */ +ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + +/*! ZSTD_estimateCStreamSize() : + * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. + * It will also consider src size to be arbitrarily "large", which is worst case. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. + * ZSTD_DStream memory budget depends on window Size. 
+ * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. + * In this case, get total size by adding ZSTD_estimate?DictSize */ +ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + +/*! ZSTD_estimate?DictSize() : + * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). + * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). + * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. + */ +ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); + +/*! ZSTD_initStatic*() : + * Initialize an object using a pre-allocated fixed-size buffer. + * workspace: The memory area to emplace the object into. + * Provided pointer *must be 8-bytes aligned*. + * Buffer must outlive object. + * workspaceSize: Use ZSTD_estimate*Size() to determine + * how large workspace must be to support target scenario. 
+ * @return : pointer to object (same address as workspace, just different type), + * or NULL if error (size too small, incorrect alignment, etc.) + * Note : zstd will never resize nor malloc() when using a static buffer. + * If the object requires more memory than available, + * zstd will just error out (typically ZSTD_error_memory_allocation). + * Note 2 : there is no corresponding "free" function. + * Since workspace is allocated externally, it must be freed externally too. + * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level + * into its associated cParams. + * Limitation 1 : currently not compatible with internal dictionary creation, triggered by + * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict(). + * Limitation 2 : static cctx currently not compatible with multi-threading. + * Limitation 3 : static dctx is incompatible with legacy support. + */ +ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ + +ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ + +ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams); + +ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType); + +/*! Custom memory allocation : + * These prototypes make it possible to pass your own allocation/free functions. 
+ * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below. + * All allocation/free operations will be completed using these custom variants instead of regular ones. + */ typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size); typedef void (*ZSTD_freeFunction) (void* opaque, void* address); typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; +static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ + +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + /*************************************** * Advanced compression functions ***************************************/ -/*! ZSTD_estimateCCtxSize() : - * Gives the amount of memory allocated for a ZSTD_CCtx given a set of compression parameters. - * `frameContentSize` is an optional parameter, provide `0` if unknown */ -ZSTDLIB_API size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams); - -/*! ZSTD_createCCtx_advanced() : - * Create a ZSTD compression context using external alloc and free functions */ -ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); - -/*! 
ZSTD_sizeofCCtx() : - * Gives the amount of memory used by a given ZSTD_CCtx */ -ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); - -typedef enum { - ZSTD_p_forceWindow /* Force back-references to remain < windowSize, even when referencing Dictionary content (default:0)*/ -} ZSTD_CCtxParameter; -/*! ZSTD_setCCtxParameter() : - * Set advanced parameters, selected through enum ZSTD_CCtxParameter - * @result : 0, or an error code (which can be tested with ZSTD_isError()) */ -ZSTDLIB_API size_t ZSTD_setCCtxParameter(ZSTD_CCtx* cctx, ZSTD_CCtxParameter param, unsigned value); /*! ZSTD_createCDict_byReference() : * Create a digested dictionary for compression @@ -415,15 +620,6 @@ * It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict */ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); -/*! ZSTD_createCDict_advanced() : - * Create a ZSTD_CDict using external alloc and free, and customized compression parameters */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, unsigned byReference, - ZSTD_parameters params, ZSTD_customMem customMem); - -/*! ZSTD_sizeof_CDict() : - * Gives the amount of memory used by a given ZSTD_sizeof_CDict */ -ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); - /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. * `estimatedSrcSize` value is optional, select 0 if not known */ @@ -431,7 +627,7 @@ /*! ZSTD_getParams() : * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. 
-* All fields of `ZSTD_frameParameters` are set to default (0) */ +* All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); /*! ZSTD_checkCParams() : @@ -439,17 +635,24 @@ ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); /*! ZSTD_adjustCParams() : -* optimize params for a given `srcSize` and `dictSize`. -* both values are optional, select `0` if unknown. */ + * optimize params for a given `srcSize` and `dictSize`. + * both values are optional, select `0` if unknown. */ ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); /*! ZSTD_compress_advanced() : -* Same as ZSTD_compress_usingDict(), with fine-tune control of each compression parameter */ -ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - ZSTD_parameters params); +* Same as ZSTD_compress_usingDict(), with fine-tune control over each compression parameter */ +ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : +* Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams); /*--- Advanced decompression functions ---*/ @@ -461,30 +664,13 @@ * Note 3 : Skippable Frame Identifiers are considered valid. */ ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); -/*! 
ZSTD_estimateDCtxSize() : - * Gives the potential amount of memory allocated to create a ZSTD_DCtx */ -ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); - -/*! ZSTD_createDCtx_advanced() : - * Create a ZSTD decompression context using external alloc and free functions */ -ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); - -/*! ZSTD_sizeof_DCtx() : - * Gives the amount of memory used by a given ZSTD_DCtx */ -ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); - /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, ready to start decompression operation without startup delay. - * Dictionary content is simply referenced, and therefore stays in dictBuffer. - * It is important that dictBuffer outlives DDict, it must remain read accessible throughout the lifetime of DDict */ + * Dictionary content is referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, + * it must remain read accessible throughout the lifetime of DDict */ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, - unsigned byReference, ZSTD_customMem customMem); - -/*! ZSTD_sizeof_DDict() : - * Gives the amount of memory used by a given ZSTD_DDict */ -ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); /*! ZSTD_getDictID_fromDict() : * Provides the dictID stored within dictionary. @@ -507,7 +693,7 @@ * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. - * When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. 
*/ + * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); @@ -516,32 +702,55 @@ ********************************************************************/ /*===== Advanced Streaming compression functions =====*/ -ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); /**< pledgedSrcSize must be correct */ -ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */ +ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); /**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */ +ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. Note: dict is loaded with ZSTD_dm_auto (treated as a full zstd dictionary if it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.*/ ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, - ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be zero == unknown */ + ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. 
dict is loaded with ZSTD_dm_auto and ZSTD_dlm_byCopy. */ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /**< note : cdict will just be referenced, and must outlive compression session */ -ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); /**< re-use compression parameters from previous init; skip dictionary loading stage; zcs must be init at least once before */ -ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize); /**< same as ZSTD_initCStream_usingCDict(), with control over frame parameters. pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. */ + +/*! ZSTD_resetCStream() : + * start a new compression job, using same parameters from previous job. + * This is typically useful to skip dictionary loading stage, since it will re-use it in-place.. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. + * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, + * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. 
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +typedef struct { + unsigned long long ingested; + unsigned long long consumed; + unsigned long long produced; +} ZSTD_frameProgression; + +/* ZSTD_getFrameProgression(): + * tells how much data has been ingested (read from input) + * consumed (input actually compressed) and produced (output) for current frame. + * Therefore, (ingested - consumed) is amount of input data buffered internally, not yet compressed. + * Can report progression inside worker threads (multi-threading and non-blocking mode). + */ +ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); + /*===== Advanced Streaming decompression functions =====*/ typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e; -ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */ -ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue); -ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /**< note : ddict will just be referenced, and must outlive decompression session */ +ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue); /* obsolete : this API will be removed in a future version */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: no dictionary will be used if dict == NULL or dictSize < 8 */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /**< note : ddict is referenced, it must outlive decompression session */ ZSTDLIB_API size_t 
ZSTD_resetDStream(ZSTD_DStream* zds); /**< re-use decompression parameters from previous init; saves dictionary loading */ -ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); /********************************************************************* * Buffer-less and synchronous inner streaming functions * * This is an advanced API, giving full control over buffer management, for users which need direct control over memory. -* But it's also a complex one, with many restrictions (documented below). -* Prefer using normal streaming API for an easier experience +* But it's also a complex one, with several restrictions, documented below. +* Prefer normal streaming API for an easier experience. ********************************************************************* */ /** @@ -558,8 +767,8 @@ Then, consume your input using ZSTD_compressContinue(). There are some important considerations to keep in mind when using this advanced function : - - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffer only. - - Interface is synchronous : input is consumed entirely and produce 1+ (or more) compressed blocks. + - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only. + - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks. - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario. Worst case evaluation is provided by ZSTD_compressBound(). ZSTD_compressContinue() doesn't guarantee recover after a failed compression. @@ -570,22 +779,23 @@ Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. - Without last block mark, frames will be considered unfinished (corrupted) by decoders. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. 
- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new frame. + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. */ /*===== Buffer-less streaming compression functions =====*/ ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); -ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. 
If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - /*- Buffer-less streaming decompression (synchronous mode) @@ -593,38 +803,54 @@ Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. A ZSTD_DCtx object can be re-used multiple times. - First typical operation is to retrieve frame parameters, using ZSTD_getFrameParams(). - It fills a ZSTD_frameParams structure which provide important information to correctly decode the frame, - such as the minimum rolling buffer size to allocate to decompress data (`windowSize`), - and the dictionary ID used. - (Note : content size is optional, it may not be present. 0 means : content size unknown). - Note that these values could be wrong, either because of data malformation, or because an attacker is spoofing deliberate false information. - As a consequence, check that values remain within valid application range, especially `windowSize`, before allocation. - Each application can set its own limit, depending on local restrictions. For extended interoperability, it is recommended to support at least 8 MB. - Frame parameters are extracted from the beginning of the compressed frame. - Data fragment must be large enough to ensure successful decoding, typically `ZSTD_frameHeaderSize_max` bytes. - @result : 0 : successful decoding, the `ZSTD_frameParams` structure is correctly filled. + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. 
+ Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. + @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. errorCode, which can be tested using ZSTD_isError(). - Start decompression, with ZSTD_decompressBegin() or ZSTD_decompressBegin_usingDict(). - Alternatively, you can copy a prepared context, using ZSTD_copyDCtx(). + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. + + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). 
+ In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. + At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are setup, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively. ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. - @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). - It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some metadata item. + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. It can also be an error code, which can be tested with ZSTD_isError(). - ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize`. - They should preferably be located contiguously, prior to current block. 
- Alternatively, a round buffer of sufficient size is also possible. Sufficient size is determined by frame parameters. - ZSTD_decompressContinue() is very sensitive to contiguity, - if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, - or that previous contiguous segment is large enough to properly handle maximum back-reference. - A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. Context can then be reset to start a new decompression. @@ -634,35 +860,505 @@ == Special case : skippable frames == Skippable frames allow integration of user-defined data into a flow of concatenated frames. - Skippable frames will be ignored (skipped) by a decompressor. The format of skippable frames is as follows : + Skippable frames will be ignored (skipped) by decompressor. + The format of skippable frames is as follows : a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits c) Frame Content - any content (User Data) of length equal to Frame Size - For skippable frames ZSTD_decompressContinue() always returns 0. - For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0 what means that a frame is skippable. - It also returns Frame Size as fparamsPtr->frameContentSize. + For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame. + For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content. */ +/*===== Buffer-less streaming decompression functions =====*/ +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; typedef struct { - unsigned long long frameContentSize; - unsigned windowSize; + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 
0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; unsigned dictID; unsigned checksumFlag; -} ZSTD_frameParams; +} ZSTD_frameHeader; +ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ -/*===== Buffer-less streaming decompression functions =====*/ -ZSTDLIB_API size_t ZSTD_getFrameParams(ZSTD_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input, see details below */ ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); -ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -/** - Block functions + + +/* ============================================ */ +/** New advanced API (experimental) */ +/* ============================================ */ + +/* notes on API design : + * In this proposal, parameters are 
pushed one by one into an existing context, + * and then applied on all subsequent compression jobs. + * When no parameter is ever provided, CCtx is created with compression level ZSTD_CLEVEL_DEFAULT. + * + * This API is intended to replace all others advanced / experimental API entry points. + * But it stands a reasonable chance to become "stable", after a reasonable testing period. + */ + +/* note on naming convention : + * Initially, the API favored names like ZSTD_setCCtxParameter() . + * In this proposal, convention is changed towards ZSTD_CCtx_setParameter() . + * The main driver is that it identifies more clearly the target object type. + * It feels clearer when considering multiple targets : + * ZSTD_CDict_setParameter() (rather than ZSTD_setCDictParameter()) + * ZSTD_CCtxParams_setParameter() (rather than ZSTD_setCCtxParamsParameter() ) + * etc... + */ + +/* note on enum design : + * All enum will be pinned to explicit values before reaching "stable API" status */ + +typedef enum { + /* Opened question : should we have a format ZSTD_f_auto ? + * Today, it would mean exactly the same as ZSTD_f_zstd1. + * But, in the future, should several formats become supported, + * on the compression side, it would mean "default format". + * On the decompression side, it would mean "automatic format detection", + * so that ZSTD_f_zstd1 would mean "accept *only* zstd frames". + * Since meaning is a little different, another option could be to define different enums for compression and decompression. + * This question could be kept for later, when there are actually multiple formats to support, + * but there is also the question of pinning enum values, and pinning value `0` is especially important */ + ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */ + ZSTD_f_zstd1_magicless, /* Variant of zstd frame format, without initial 4-bytes magic number. + * Useful to save 4 bytes per generated frame. 
+ * Decoder cannot recognise automatically this format, requiring instructions. */ +} ZSTD_format_e; + +typedef enum { + /* compression format */ + ZSTD_p_format = 10, /* See ZSTD_format_e enum definition. + * Cast selected format as unsigned for ZSTD_CCtx_setParameter() compatibility. */ + + /* compression parameters */ + ZSTD_p_compressionLevel=100, /* Update all compression parameters according to pre-defined cLevel table + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means "do not change cLevel". + * Note 1 : it's possible to pass a negative compression level by casting it to unsigned type. + * Note 2 : setting a level sets all default values of other compression parameters. + * Note 3 : setting compressionLevel automatically updates ZSTD_p_compressLiterals. */ + ZSTD_p_windowLog, /* Maximum allowed back-reference distance, expressed as power of 2. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a window size greater than ZSTD_MAXWINDOWSIZE_DEFAULT (default: 2^27) + * requires explicitly allowing such window size during decompression stage. */ + ZSTD_p_hashLog, /* Size of the probe table, as a power of 2. + * Resulting table size is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. + * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_p_chainLog, /* Size of the full-search table, as a power of 2. + * Resulting table size is (1 << (chainLog+2)). + * Larger tables result in better and slower compression. + * This parameter is useless when using "fast" strategy. + * Special: value 0 means "use default chainLog". */ + ZSTD_p_searchLog, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. 
+ * This parameter is useless when using "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_p_minMatch, /* Minimum size of searched matches (note : repCode matches can be smaller). + * Larger values make faster compression and decompression, but decrease ratio. + * Must be clamped between ZSTD_SEARCHLENGTH_MIN and ZSTD_SEARCHLENGTH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_p_targetLength, /* Impact of this field depends on strategy. + * For strategies btopt & btultra: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_p_compressionStrategy, /* See ZSTD_strategy enum definition. + * Cast selected strategy as unsigned for ZSTD_CCtx_setParameter() compatibility. + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + ZSTD_p_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases ZSTD_p_windowLog to 128 MB + * except when expressly set to a different value. */ + ZSTD_p_ldmHashLog, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. 
+ * Special: value 0 means "automatically determine hashlog". */ + ZSTD_p_ldmMinMatch, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_p_ldmBucketSizeLog, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX . + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_p_ldmHashEveryLog, /* Frequency of inserting/looking up entries in the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. + * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashEveryLog". */ + + /* frame parameters */ + ZSTD_p_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression, + * it is provided using ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_p_checksumFlag, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_p_dictIDFlag, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only useful if multi-threading is enabled (ZSTD_MULTITHREAD). + * They return an error otherwise. */ + ZSTD_p_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. 
+ * When nbWorkers >= 1, triggers asynchronous mode : + * ZSTD_compress_generic() consumes some input, flush some output if possible, and immediately gives back control to caller, + * while compression work is performed in parallel, within worker threads. + * (note : a strong exception to this rule is when first invocation sets ZSTD_e_end : it becomes a blocking call). + * More workers improve speed, but also increase memory usage. + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */ + ZSTD_p_jobSize, /* Size of a compression job. This value is enforced only in non-blocking mode. + * Each compression job is completed in parallel, so this value indirectly controls the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlapSize, or 1 MB, whichever is largest. + * The minimum size is automatically and transparently enforced */ + ZSTD_p_overlapSizeLog, /* Size of previous input reloaded at the beginning of each job. + * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */ + + /* =================================================================== */ + /* experimental parameters - no stability guaranteed */ + /* =================================================================== */ + + ZSTD_p_compressLiterals=1000, /* control huffman compression of literals (enabled) by default. + * disabling it improves speed and decreases compression ratio by a large amount. + * note : this setting is automatically updated when changing compression level. + * positive compression levels set ZSTD_p_compressLiterals to 1. + * negative compression levels set ZSTD_p_compressLiterals to 0. 
*/ + + ZSTD_p_forceMaxWindow=1100, /* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ + +} ZSTD_cParameter; + + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Setting a parameter is generally only possible during frame initialization (before starting compression), + * except for a few exceptions which can be updated during compression: compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + * Note : when `value` is an enum, cast it to unsigned for proper type checking. + * @result : informational value (typically, value being set clamped correctly), + * or an error code (which can be tested with ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value); + +/*! ZSTD_CCtx_setPledgedSrcSize() : + * Total input data size to be compressed as a single frame. + * This value will be controlled at the end, and result in error if not respected. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : 0 means zero, empty. + * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN. + * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new compression job. + * Note 2 : If all data is provided and consumed in a single round, + * this value is overriden by srcSize instead. */ +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize); + +/*! ZSTD_CCtx_loadDictionary() : + * Create an internal CDict from `dict` buffer. + * Decompression will have to use same dictionary. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Dictionary will be used for all future compression jobs. 
+ * To return to "no-dictionary" situation, load a NULL dictionary + * Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters. + * For this reason, compression parameters cannot be changed anymore after loading a dictionary. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Note 3 :`dict` content will be copied internally. + * Use ZSTD_CCtx_loadDictionary_byReference() to reference dictionary content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() + * to precisely select how dictionary content must be interpreted. */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + + +/*! ZSTD_CCtx_refCDict() : + * Reference a prepared dictionary, to be used for all next compression jobs. + * Note that compression parameters are enforced from within CDict, + * and supercede any compression parameter previously set within CCtx. + * The dictionary will remain valid for future compression jobs using same CCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : adding a NULL CDict means "return to no-dictionary mode". + * Note 1 : Currently, only one dictionary can be managed. + * Adding a new dictionary effectively "discards" any previous one. + * Note 2 : CDict is just referenced, its lifetime must outlive CCtx. */ +ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); +/*! ZSTD_CCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) for next compression job. 
+ * Decompression need same prefix to properly regenerate data. + * Prefix is **only used once**. Tables are discarded at end of compression job. + * Subsequent compression jobs will be done without prefix (if none is explicitly referenced). + * If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It must outlive compression job. + * Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters. + * It's a CPU consuming operation, with non-negligible impact on latency. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent). + * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_reset() : + * Return a CCtx to clean state. + * Useful after an error, or to interrupt an ongoing compression job and start a new one. + * Any internal data not yet flushed is cancelled. + * Dictionary (if any) is dropped. + * All parameters are back to default values. + * It's possible to modify compression parameters after a reset. + */ +ZSTDLIB_API void ZSTD_CCtx_reset(ZSTD_CCtx* cctx); + + + +typedef enum { + ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal conditions */ + ZSTD_e_flush, /* flush any data provided so far - frame will continue, future data can still reference previous data for better compression */ + ZSTD_e_end /* flush any remaining data and close current frame. Any additional data starts a new frame. */ +} ZSTD_EndDirective; + +/*! 
ZSTD_compress_generic() : + * Behave about the same as ZSTD_compressStream. To note : + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_setParameter() + * - Compression parameters cannot be changed once compression is started. + * - outpot->pos must be <= dstCapacity, input->pos must be <= srcSize + * - outpot->pos and input->pos will be updated. They are guaranteed to remain below their respective limit. + * - In single-thread mode (default), function is blocking : it completed its job before returning to caller. + * - In multi-thread mode, function is non-blocking : it just acquires a copy of input, and distribute job to internal worker threads, + * and then immediately returns, just indicating that there is some data remaining to be flushed. + * The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte. + * - Exception : in multi-threading mode, if the first call requests a ZSTD_e_end directive, it is blocking : it will complete compression before giving back control to caller. + * - @return provides a minimum amount of data remaining to be flushed from internal buffers + * or an error code, which can be tested using ZSTD_isError(). + * if @return != 0, flush is not fully completed, there is still some data left within internal buffers. + * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers. + * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed. + * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0), + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. + */ +ZSTDLIB_API size_t ZSTD_compress_generic (ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/*! 
ZSTD_compress_generic_simpleArgs() : + * Same as ZSTD_compress_generic(), + * but using only integral types as arguments. + * Argument list is larger than ZSTD_{in,out}Buffer, + * but can be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. + */ +ZSTDLIB_API size_t ZSTD_compress_generic_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp); + + +/*! ZSTD_CCtx_params : + * Quick howto : + * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure + * - ZSTD_CCtxParam_setParameter() : Push parameters one by one into + * an existing ZSTD_CCtx_params structure. + * This is similar to + * ZSTD_CCtx_setParameter(). + * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to + * an existing CCtx. + * These parameters will be applied to + * all subsequent compression jobs. + * - ZSTD_compress_generic() : Do compression using the CCtx. + * - ZSTD_freeCCtxParams() : Free the memory. + * + * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() + * for static allocation for single-threaded compression. + */ +ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); + + +/*! ZSTD_CCtxParams_reset() : + * Reset params to default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_init() : + * Initializes the compression parameters of cctxParams according to + * compression level. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); + +/*! ZSTD_CCtxParams_init_advanced() : + * Initializes the compression and frame parameters of cctxParams according to + * params. All other parameters are reset to their default values. 
+ */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); + + +/*! ZSTD_CCtxParam_setParameter() : + * Similar to ZSTD_CCtx_setParameter. + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams(). + * Note : when `value` is an enum, cast it to unsigned for proper type checking. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParam_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value); + +/*! ZSTD_CCtx_setParametersUsingCCtxParams() : + * Apply a set of ZSTD_CCtx_params to the compression context. + * This can be done even after compression is started, + * if nbWorkers==0, this will have no impact until a new compression is started. + * if nbWorkers>=1, new parameters will be picked up at next job, + * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); + + +/*=== Advanced parameters for decompression API ===*/ + +/* The following parameters must be set after creating a ZSTD_DCtx* (or ZSTD_DStream*) object, + * but before starting decompression of a frame. + */ + +/*! ZSTD_DCtx_loadDictionary() : + * Create an internal DDict from dict buffer, + * to be used to decompress next frames. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : `dict` content will be copied internally. + * Use ZSTD_DCtx_loadDictionary_byReference() + * to reference dictionary content instead. + * In which case, the dictionary buffer must outlive its users. 
+ * Note 2 : Loading a dictionary involves building tables, + * which has a non-negligible impact on CPU usage and latency. + * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to select + * how dictionary content will be interpreted and loaded. + */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + + +/*! ZSTD_DCtx_refDDict() : + * Reference a prepared dictionary, to be used to decompress next frames. + * The dictionary remains active for decompression of future frames using same DCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Special : adding a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + + +/*! ZSTD_DCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) for next compression job. + * Prefix is **only used once**. It must be explicitly referenced before each frame. + * If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_DDict instead. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary + * Note 2 : Prefix buffer is referenced. It must outlive compression job. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent). + * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. 
+ * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize); +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + + +/*! ZSTD_DCtx_setMaxWindowSize() : + * Refuses allocating internal buffers for frames requiring a window size larger than provided limit. + * This is useful to prevent a decoder context from reserving too much memory for itself (potential attack scenario). + * This parameter is only useful in streaming mode, since no internal buffer is allocated in direct mode. + * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_MAX) + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); + + +/*! ZSTD_DCtx_setFormat() : + * Instruct the decoder context about what kind of data to decode next. + * This instruction is mandatory to decode data without a fully-formed header, + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + +/*! ZSTD_decompress_generic() : + * Behave the same as ZSTD_decompressStream. + * Decompression parameters cannot be changed once decompression is started. + * @return : an error code, which can be tested using ZSTD_isError() + * if >0, a hint, nb of expected input bytes for next invocation. + * `0` means : a frame has just been fully decoded and flushed. + */ +ZSTDLIB_API size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input); + + +/*! ZSTD_decompress_generic_simpleArgs() : + * Same as ZSTD_decompress_generic(), + * but using only integral types as arguments. 
+ * Argument list is larger than ZSTD_{in,out}Buffer, + * but can be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. + */ +ZSTDLIB_API size_t ZSTD_decompress_generic_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos); + + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * If a decompression was ongoing, any internal data not yet flushed is cancelled. + * All parameters are back to default values, including sticky ones. + * Dictionary (if any) is dropped. + * Parameters can be modified again after a reset. + */ +ZSTDLIB_API void ZSTD_DCtx_reset(ZSTD_DCtx* dctx); + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! Block functions produce and decode raw zstd blocks, without frame metadata. Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes). User will have to take in charge required information to regenerate data, such as compressed and content sizes. @@ -671,27 +1367,29 @@ - Compressing and decompressing require a context structure + Use ZSTD_createCCtx() and ZSTD_createDCtx() - It is necessary to init context before starting - + compression : ZSTD_compressBegin() - + decompression : ZSTD_decompressBegin() - + variants _usingDict() are also allowed - + copyCCtx() and copyDCtx() work too - - Block size is limited, it must be <= ZSTD_getBlockSizeMax() - + If you need to compress more, cut data into multiple blocks - + Consider using the regular ZSTD_compress() instead, as frame metadata costs become negligible when source size is large. 
+ + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block size, consider using the regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger. - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero. In which case, nothing is produced into `dst`. + User must test for such outcome and deal directly with uncompressed data + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!! - + In case of multiple successive blocks, decoder must be informed of uncompressed block existence to follow proper history. - Use ZSTD_insertBlock() in such a case. + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. */ -#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024) /* define, for static allocation */ +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1< WINDOWLOG_MAX: - raise ValueError('invalid window log value') +def _zstd_error(zresult): + # Resolves to bytes on Python 2 and 3. We use the string for formatting + # into error messages, which will be literal unicode. So convert it to + # unicode. 
+ return ffi.string(lib.ZSTD_getErrorName(zresult)).decode('utf-8') - if chain_log < CHAINLOG_MIN or chain_log > CHAINLOG_MAX: - raise ValueError('invalid chain log value') +def _make_cctx_params(params): + res = lib.ZSTD_createCCtxParams() + if res == ffi.NULL: + raise MemoryError() + + res = ffi.gc(res, lib.ZSTD_freeCCtxParams) - if hash_log < HASHLOG_MIN or hash_log > HASHLOG_MAX: - raise ValueError('invalid hash log value') + attrs = [ + (lib.ZSTD_p_format, params.format), + (lib.ZSTD_p_compressionLevel, params.compression_level), + (lib.ZSTD_p_windowLog, params.window_log), + (lib.ZSTD_p_hashLog, params.hash_log), + (lib.ZSTD_p_chainLog, params.chain_log), + (lib.ZSTD_p_searchLog, params.search_log), + (lib.ZSTD_p_minMatch, params.min_match), + (lib.ZSTD_p_targetLength, params.target_length), + (lib.ZSTD_p_compressionStrategy, params.compression_strategy), + (lib.ZSTD_p_contentSizeFlag, params.write_content_size), + (lib.ZSTD_p_checksumFlag, params.write_checksum), + (lib.ZSTD_p_dictIDFlag, params.write_dict_id), + (lib.ZSTD_p_nbWorkers, params.threads), + (lib.ZSTD_p_jobSize, params.job_size), + (lib.ZSTD_p_overlapSizeLog, params.overlap_size_log), + (lib.ZSTD_p_compressLiterals, params.compress_literals), + (lib.ZSTD_p_forceMaxWindow, params.force_max_window), + (lib.ZSTD_p_enableLongDistanceMatching, params.enable_ldm), + (lib.ZSTD_p_ldmHashLog, params.ldm_hash_log), + (lib.ZSTD_p_ldmMinMatch, params.ldm_min_match), + (lib.ZSTD_p_ldmBucketSizeLog, params.ldm_bucket_size_log), + (lib.ZSTD_p_ldmHashEveryLog, params.ldm_hash_every_log), + ] - if search_log < SEARCHLOG_MIN or search_log > SEARCHLOG_MAX: - raise ValueError('invalid search log value') + for param, value in attrs: + _set_compression_parameter(res, param, value) + + return res - if search_length < SEARCHLENGTH_MIN or search_length > SEARCHLENGTH_MAX: - raise ValueError('invalid search length value') +class ZstdCompressionParameters(object): + @staticmethod + def from_level(level, source_size=0, 
dict_size=0, **kwargs): + params = lib.ZSTD_getCParams(level, source_size, dict_size) - if target_length < TARGETLENGTH_MIN or target_length > TARGETLENGTH_MAX: - raise ValueError('invalid target length value') + args = { + 'window_log': 'windowLog', + 'chain_log': 'chainLog', + 'hash_log': 'hashLog', + 'search_log': 'searchLog', + 'min_match': 'searchLength', + 'target_length': 'targetLength', + 'compression_strategy': 'strategy', + } + + for arg, attr in args.items(): + if arg not in kwargs: + kwargs[arg] = getattr(params, attr) + + if 'compress_literals' not in kwargs: + kwargs['compress_literals'] = 1 if level >= 0 else 0 - if strategy < STRATEGY_FAST or strategy > STRATEGY_BTOPT: - raise ValueError('invalid strategy value') + return ZstdCompressionParameters(**kwargs) + + def __init__(self, format=0, compression_level=0, window_log=0, hash_log=0, + chain_log=0, search_log=0, min_match=0, target_length=0, + compression_strategy=0, write_content_size=1, write_checksum=0, + write_dict_id=0, job_size=0, overlap_size_log=0, + force_max_window=0, enable_ldm=0, ldm_hash_log=0, + ldm_min_match=0, ldm_bucket_size_log=0, ldm_hash_every_log=0, + threads=0, compress_literals=None): + if threads < 0: + threads = _cpu_count() + + if compress_literals is None: + compress_literals = compression_level >= 0 + + self.format = format + self.compression_level = compression_level self.window_log = window_log - self.chain_log = chain_log self.hash_log = hash_log + self.chain_log = chain_log self.search_log = search_log - self.search_length = search_length + self.min_match = min_match self.target_length = target_length - self.strategy = strategy + self.compression_strategy = compression_strategy + self.write_content_size = write_content_size + self.write_checksum = write_checksum + self.write_dict_id = write_dict_id + self.job_size = job_size + self.overlap_size_log = overlap_size_log + self.compress_literals = compress_literals + self.force_max_window = force_max_window + 
self.enable_ldm = enable_ldm + self.ldm_hash_log = ldm_hash_log + self.ldm_min_match = ldm_min_match + self.ldm_bucket_size_log = ldm_bucket_size_log + self.ldm_hash_every_log = ldm_hash_every_log + self.threads = threads - zresult = lib.ZSTD_checkCParams(self.as_compression_parameters()) - if lib.ZSTD_isError(zresult): - raise ValueError('invalid compression parameters: %s', - ffi.string(lib.ZSTD_getErrorName(zresult))) + self.params = _make_cctx_params(self) def estimated_compression_context_size(self): - return lib.ZSTD_estimateCCtxSize(self.as_compression_parameters()) - - def as_compression_parameters(self): - p = ffi.new('ZSTD_compressionParameters *')[0] - p.windowLog = self.window_log - p.chainLog = self.chain_log - p.hashLog = self.hash_log - p.searchLog = self.search_log - p.searchLength = self.search_length - p.targetLength = self.target_length - p.strategy = self.strategy - - return p + return lib.ZSTD_estimateCCtxSize_usingCCtxParams(self.params) -def get_compression_parameters(level, source_size=0, dict_size=0): - params = lib.ZSTD_getCParams(level, source_size, dict_size) - return CompressionParameters(window_log=params.windowLog, - chain_log=params.chainLog, - hash_log=params.hashLog, - search_log=params.searchLog, - search_length=params.searchLength, - target_length=params.targetLength, - strategy=params.strategy) - - -def estimate_compression_context_size(params): - if not isinstance(params, CompressionParameters): - raise ValueError('argument must be a CompressionParameters') - - cparams = params.as_compression_parameters() - return lib.ZSTD_estimateCCtxSize(cparams) - +CompressionParameters = ZstdCompressionParameters def estimate_decompression_context_size(): return lib.ZSTD_estimateDCtxSize() +def _set_compression_parameter(params, param, value): + zresult = lib.ZSTD_CCtxParam_setParameter(params, param, + ffi.cast('unsigned', value)) + if lib.ZSTD_isError(zresult): + raise ZstdError('unable to set compression context parameter: %s' % + 
_zstd_error(zresult)) + class ZstdCompressionWriter(object): def __init__(self, compressor, writer, source_size, write_size): self._compressor = compressor @@ -169,16 +291,18 @@ self._source_size = source_size self._write_size = write_size self._entered = False - self._mtcctx = compressor._cctx if compressor._multithreaded else None + self._bytes_compressed = 0 def __enter__(self): if self._entered: raise ZstdError('cannot __enter__ multiple times') - if self._mtcctx: - self._compressor._init_mtcstream(self._source_size) - else: - self._compressor._ensure_cstream(self._source_size) + zresult = lib.ZSTD_CCtx_setPledgedSrcSize(self._compressor._cctx, + self._source_size) + if lib.ZSTD_isError(zresult): + raise ZstdError('error setting source size: %s' % + _zstd_error(zresult)) + self._entered = True return self @@ -186,20 +310,27 @@ self._entered = False if not exc_type and not exc_value and not exc_tb: + dst_buffer = ffi.new('char[]', self._write_size) + out_buffer = ffi.new('ZSTD_outBuffer *') - dst_buffer = ffi.new('char[]', self._write_size) + in_buffer = ffi.new('ZSTD_inBuffer *') + out_buffer.dst = dst_buffer - out_buffer.size = self._write_size + out_buffer.size = len(dst_buffer) out_buffer.pos = 0 + in_buffer.src = ffi.NULL + in_buffer.size = 0 + in_buffer.pos = 0 + while True: - if self._mtcctx: - zresult = lib.ZSTDMT_endStream(self._mtcctx, out_buffer) - else: - zresult = lib.ZSTD_endStream(self._compressor._cstream, out_buffer) + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + out_buffer, in_buffer, + lib.ZSTD_e_end) + if lib.ZSTD_isError(zresult): raise ZstdError('error ending compression stream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) @@ -217,7 +348,7 @@ raise ZstdError('cannot determine size of an inactive compressor; ' 'call when a context manager is active') - return lib.ZSTD_sizeof_CStream(self._compressor._cstream) + 
return lib.ZSTD_sizeof_CCtx(self._compressor._cctx) def write(self, data): if not self._entered: @@ -240,19 +371,17 @@ out_buffer.pos = 0 while in_buffer.pos < in_buffer.size: - if self._mtcctx: - zresult = lib.ZSTDMT_compressStream(self._mtcctx, out_buffer, - in_buffer) - else: - zresult = lib.ZSTD_compressStream(self._compressor._cstream, out_buffer, - in_buffer) + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + out_buffer, in_buffer, + lib.ZSTD_e_continue) if lib.ZSTD_isError(zresult): raise ZstdError('zstd compress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) total_write += out_buffer.pos + self._bytes_compressed += out_buffer.pos out_buffer.pos = 0 return total_write @@ -269,24 +398,32 @@ out_buffer.size = self._write_size out_buffer.pos = 0 + in_buffer = ffi.new('ZSTD_inBuffer *') + in_buffer.src = ffi.NULL + in_buffer.size = 0 + in_buffer.pos = 0 + while True: - if self._mtcctx: - zresult = lib.ZSTDMT_flushStream(self._mtcctx, out_buffer) - else: - zresult = lib.ZSTD_flushStream(self._compressor._cstream, out_buffer) + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + out_buffer, in_buffer, + lib.ZSTD_e_flush) if lib.ZSTD_isError(zresult): raise ZstdError('zstd compress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if not out_buffer.pos: break self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) total_write += out_buffer.pos + self._bytes_compressed += out_buffer.pos out_buffer.pos = 0 return total_write + def tell(self): + return self._bytes_compressed + class ZstdCompressionObj(object): def compress(self, data): @@ -302,15 +439,13 @@ chunks = [] while source.pos < len(data): - if self._mtcctx: - zresult = lib.ZSTDMT_compressStream(self._mtcctx, - self._out, source) - else: - zresult = lib.ZSTD_compressStream(self._compressor._cstream, self._out, - source) + 
zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + self._out, + source, + lib.ZSTD_e_continue) if lib.ZSTD_isError(zresult): raise ZstdError('zstd compress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if self._out.pos: chunks.append(ffi.buffer(self._out.dst, self._out.pos)[:]) @@ -327,14 +462,19 @@ assert self._out.pos == 0 + in_buffer = ffi.new('ZSTD_inBuffer *') + in_buffer.src = ffi.NULL + in_buffer.size = 0 + in_buffer.pos = 0 + if flush_mode == COMPRESSOBJ_FLUSH_BLOCK: - if self._mtcctx: - zresult = lib.ZSTDMT_flushStream(self._mtcctx, self._out) - else: - zresult = lib.ZSTD_flushStream(self._compressor._cstream, self._out) + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + self._out, + in_buffer, + lib.ZSTD_e_flush) if lib.ZSTD_isError(zresult): raise ZstdError('zstd compress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) # Output buffer is guaranteed to hold full block. assert zresult == 0 @@ -352,13 +492,13 @@ chunks = [] while True: - if self._mtcctx: - zresult = lib.ZSTDMT_endStream(self._mtcctx, self._out) - else: - zresult = lib.ZSTD_endStream(self._compressor._cstream, self._out) + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + self._out, + in_buffer, + lib.ZSTD_e_end) if lib.ZSTD_isError(zresult): raise ZstdError('error ending compression stream: %s' % - ffi.string(lib.ZSTD_getErroName(zresult))) + _zstd_error(zresult)) if self._out.pos: chunks.append(ffi.buffer(self._out.dst, self._out.pos)[:]) @@ -370,95 +510,335 @@ return b''.join(chunks) +class CompressionReader(object): + def __init__(self, compressor, source, size, read_size): + self._compressor = compressor + self._source = source + self._source_size = size + self._read_size = read_size + self._entered = False + self._closed = False + self._bytes_compressed = 0 + self._finished_input = False + self._finished_output = False + + self._in_buffer = ffi.new('ZSTD_inBuffer *') + # Holds a 
ref so backing bytes in self._in_buffer stay alive. + self._source_buffer = None + + def __enter__(self): + if self._entered: + raise ValueError('cannot __enter__ multiple times') + + zresult = lib.ZSTD_CCtx_setPledgedSrcSize(self._compressor._cctx, + self._source_size) + if lib.ZSTD_isError(zresult): + raise ZstdError('error setting source size: %s' % + _zstd_error(zresult)) + + self._entered = True + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + self._entered = False + self._closed = True + self._source = None + self._compressor = None + + return False + + def readable(self): + return True + + def writable(self): + return False + + def seekable(self): + return False + + def readline(self): + raise io.UnsupportedOperation() + + def readlines(self): + raise io.UnsupportedOperation() + + def write(self, data): + raise OSError('stream is not writable') + + def writelines(self, ignored): + raise OSError('stream is not writable') + + def isatty(self): + return False + + def flush(self): + return None + + def close(self): + self._closed = True + return None + + def closed(self): + return self._closed + + def tell(self): + return self._bytes_compressed + + def readall(self): + raise NotImplementedError() + + def __iter__(self): + raise io.UnsupportedOperation() + + def __next__(self): + raise io.UnsupportedOperation() + + next = __next__ + + def read(self, size=-1): + if not self._entered: + raise ZstdError('read() must be called from an active context manager') + + if self._closed: + raise ValueError('stream is closed') + + if self._finished_output: + return b'' + + if size < 1: + raise ValueError('cannot read negative or size 0 amounts') + + # Need a dedicated ref to dest buffer otherwise it gets collected. 
+ dst_buffer = ffi.new('char[]', size) + out_buffer = ffi.new('ZSTD_outBuffer *') + out_buffer.dst = dst_buffer + out_buffer.size = size + out_buffer.pos = 0 + + def compress_input(): + if self._in_buffer.pos >= self._in_buffer.size: + return + + old_pos = out_buffer.pos + + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + out_buffer, self._in_buffer, + lib.ZSTD_e_continue) + + self._bytes_compressed += out_buffer.pos - old_pos + + if self._in_buffer.pos == self._in_buffer.size: + self._in_buffer.src = ffi.NULL + self._in_buffer.pos = 0 + self._in_buffer.size = 0 + self._source_buffer = None + + if not hasattr(self._source, 'read'): + self._finished_input = True + + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s', + _zstd_error(zresult)) + + if out_buffer.pos and out_buffer.pos == out_buffer.size: + return ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + + def get_input(): + if self._finished_input: + return + + if hasattr(self._source, 'read'): + data = self._source.read(self._read_size) + + if not data: + self._finished_input = True + return + + self._source_buffer = ffi.from_buffer(data) + self._in_buffer.src = self._source_buffer + self._in_buffer.size = len(self._source_buffer) + self._in_buffer.pos = 0 + else: + self._source_buffer = ffi.from_buffer(self._source) + self._in_buffer.src = self._source_buffer + self._in_buffer.size = len(self._source_buffer) + self._in_buffer.pos = 0 + + result = compress_input() + if result: + return result + + while not self._finished_input: + get_input() + result = compress_input() + if result: + return result + + # EOF + old_pos = out_buffer.pos + + zresult = lib.ZSTD_compress_generic(self._compressor._cctx, + out_buffer, self._in_buffer, + lib.ZSTD_e_end) + + self._bytes_compressed += out_buffer.pos - old_pos + + if lib.ZSTD_isError(zresult): + raise ZstdError('error ending compression stream: %s', + _zstd_error(zresult)) + + if zresult == 0: + self._finished_output = True + + return 
ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + class ZstdCompressor(object): def __init__(self, level=3, dict_data=None, compression_params=None, - write_checksum=False, write_content_size=False, - write_dict_id=True, threads=0): - if level < 1: - raise ValueError('level must be greater than 0') - elif level > lib.ZSTD_maxCLevel(): + write_checksum=None, write_content_size=None, + write_dict_id=None, threads=0): + if level > lib.ZSTD_maxCLevel(): raise ValueError('level must be less than %d' % lib.ZSTD_maxCLevel()) if threads < 0: threads = _cpu_count() - self._compression_level = level - self._dict_data = dict_data - self._cparams = compression_params - self._fparams = ffi.new('ZSTD_frameParameters *')[0] - self._fparams.checksumFlag = write_checksum - self._fparams.contentSizeFlag = write_content_size - self._fparams.noDictIDFlag = not write_dict_id + if compression_params and write_checksum is not None: + raise ValueError('cannot define compression_params and ' + 'write_checksum') + + if compression_params and write_content_size is not None: + raise ValueError('cannot define compression_params and ' + 'write_content_size') + + if compression_params and write_dict_id is not None: + raise ValueError('cannot define compression_params and ' + 'write_dict_id') - if threads: - cctx = lib.ZSTDMT_createCCtx(threads) - if cctx == ffi.NULL: - raise MemoryError() + if compression_params and threads: + raise ValueError('cannot define compression_params and threads') - self._cctx = ffi.gc(cctx, lib.ZSTDMT_freeCCtx) - self._multithreaded = True + if compression_params: + self._params = _make_cctx_params(compression_params) else: - cctx = lib.ZSTD_createCCtx() - if cctx == ffi.NULL: + if write_dict_id is None: + write_dict_id = True + + params = lib.ZSTD_createCCtxParams() + if params == ffi.NULL: raise MemoryError() - self._cctx = ffi.gc(cctx, lib.ZSTD_freeCCtx) - self._multithreaded = False + self._params = ffi.gc(params, lib.ZSTD_freeCCtxParams) + + 
_set_compression_parameter(self._params, + lib.ZSTD_p_compressionLevel, + level) - self._cstream = None + _set_compression_parameter( + self._params, + lib.ZSTD_p_contentSizeFlag, + write_content_size if write_content_size is not None else 1) + + _set_compression_parameter(self._params, + lib.ZSTD_p_checksumFlag, + 1 if write_checksum else 0) - def compress(self, data, allow_empty=False): - if len(data) == 0 and self._fparams.contentSizeFlag and not allow_empty: - raise ValueError('cannot write empty inputs when writing content sizes') + _set_compression_parameter(self._params, + lib.ZSTD_p_dictIDFlag, + 1 if write_dict_id else 0) - if self._multithreaded and self._dict_data: - raise ZstdError('compress() cannot be used with both dictionaries and multi-threaded compression') + if threads: + _set_compression_parameter(self._params, + lib.ZSTD_p_nbWorkers, + threads) - if self._multithreaded and self._cparams: - raise ZstdError('compress() cannot be used with both compression parameters and multi-threaded compression') + cctx = lib.ZSTD_createCCtx() + if cctx == ffi.NULL: + raise MemoryError() + + self._cctx = cctx + self._dict_data = dict_data - # TODO use a CDict for performance. - dict_data = ffi.NULL - dict_size = 0 + # We defer setting up garbage collection until after calling + # _ensure_cctx() to ensure the memory size estimate is more accurate. 
+ try: + self._ensure_cctx() + finally: + self._cctx = ffi.gc(cctx, lib.ZSTD_freeCCtx, + size=lib.ZSTD_sizeof_CCtx(cctx)) - if self._dict_data: - dict_data = self._dict_data.as_bytes() - dict_size = len(self._dict_data) + def _ensure_cctx(self): + lib.ZSTD_CCtx_reset(self._cctx) + + zresult = lib.ZSTD_CCtx_setParametersUsingCCtxParams(self._cctx, + self._params) + if lib.ZSTD_isError(zresult): + raise ZstdError('could not set compression parameters: %s' % + _zstd_error(zresult)) + + dict_data = self._dict_data - params = ffi.new('ZSTD_parameters *')[0] - if self._cparams: - params.cParams = self._cparams.as_compression_parameters() - else: - params.cParams = lib.ZSTD_getCParams(self._compression_level, len(data), - dict_size) - params.fParams = self._fparams + if dict_data: + if dict_data._cdict: + zresult = lib.ZSTD_CCtx_refCDict(self._cctx, dict_data._cdict) + else: + zresult = lib.ZSTD_CCtx_loadDictionary_advanced( + self._cctx, dict_data.as_bytes(), len(dict_data), + lib.ZSTD_dlm_byRef, dict_data._dict_type) - dest_size = lib.ZSTD_compressBound(len(data)) + if lib.ZSTD_isError(zresult): + raise ZstdError('could not load compression dictionary: %s' % + _zstd_error(zresult)) + + def memory_size(self): + return lib.ZSTD_sizeof_CCtx(self._cctx) + + def compress(self, data): + self._ensure_cctx() + + data_buffer = ffi.from_buffer(data) + + dest_size = lib.ZSTD_compressBound(len(data_buffer)) out = new_nonzero('char[]', dest_size) - if self._multithreaded: - zresult = lib.ZSTDMT_compressCCtx(self._cctx, - ffi.addressof(out), dest_size, - data, len(data), - self._compression_level) - else: - zresult = lib.ZSTD_compress_advanced(self._cctx, - ffi.addressof(out), dest_size, - data, len(data), - dict_data, dict_size, - params) + zresult = lib.ZSTD_CCtx_setPledgedSrcSize(self._cctx, len(data_buffer)) + if lib.ZSTD_isError(zresult): + raise ZstdError('error setting source size: %s' % + _zstd_error(zresult)) + + out_buffer = ffi.new('ZSTD_outBuffer *') + in_buffer = 
ffi.new('ZSTD_inBuffer *') + + out_buffer.dst = out + out_buffer.size = dest_size + out_buffer.pos = 0 + + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) + in_buffer.pos = 0 + + zresult = lib.ZSTD_compress_generic(self._cctx, + out_buffer, + in_buffer, + lib.ZSTD_e_end) if lib.ZSTD_isError(zresult): raise ZstdError('cannot compress: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) + elif zresult: + raise ZstdError('unexpected partial frame flush') - return ffi.buffer(out, zresult)[:] + return ffi.buffer(out, out_buffer.pos)[:] - def compressobj(self, size=0): - if self._multithreaded: - self._init_mtcstream(size) - else: - self._ensure_cstream(size) + def compressobj(self, size=-1): + self._ensure_cctx() + + if size < 0: + size = lib.ZSTD_CONTENTSIZE_UNKNOWN + + zresult = lib.ZSTD_CCtx_setPledgedSrcSize(self._cctx, size) + if lib.ZSTD_isError(zresult): + raise ZstdError('error setting source size: %s' % + _zstd_error(zresult)) cobj = ZstdCompressionObj() cobj._out = ffi.new('ZSTD_outBuffer *') @@ -469,14 +849,9 @@ cobj._compressor = self cobj._finished = False - if self._multithreaded: - cobj._mtcctx = self._cctx - else: - cobj._mtcctx = None - return cobj - def copy_stream(self, ifh, ofh, size=0, + def copy_stream(self, ifh, ofh, size=-1, read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE, write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): @@ -485,11 +860,15 @@ if not hasattr(ofh, 'write'): raise ValueError('second argument must have a write() method') - mt = self._multithreaded - if mt: - self._init_mtcstream(size) - else: - self._ensure_cstream(size) + self._ensure_cctx() + + if size < 0: + size = lib.ZSTD_CONTENTSIZE_UNKNOWN + + zresult = lib.ZSTD_CCtx_setPledgedSrcSize(self._cctx, size) + if lib.ZSTD_isError(zresult): + raise ZstdError('error setting source size: %s' % + _zstd_error(zresult)) in_buffer = ffi.new('ZSTD_inBuffer *') out_buffer = ffi.new('ZSTD_outBuffer *') @@ -513,14 +892,13 @@ in_buffer.pos = 0 while 
in_buffer.pos < in_buffer.size: - if mt: - zresult = lib.ZSTDMT_compressStream(self._cctx, out_buffer, in_buffer) - else: - zresult = lib.ZSTD_compressStream(self._cstream, - out_buffer, in_buffer) + zresult = lib.ZSTD_compress_generic(self._cctx, + out_buffer, + in_buffer, + lib.ZSTD_e_continue) if lib.ZSTD_isError(zresult): raise ZstdError('zstd compress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) @@ -529,13 +907,13 @@ # We've finished reading. Flush the compressor. while True: - if mt: - zresult = lib.ZSTDMT_endStream(self._cctx, out_buffer) - else: - zresult = lib.ZSTD_endStream(self._cstream, out_buffer) + zresult = lib.ZSTD_compress_generic(self._cctx, + out_buffer, + in_buffer, + lib.ZSTD_e_end) if lib.ZSTD_isError(zresult): raise ZstdError('error ending compression stream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) @@ -547,17 +925,38 @@ return total_read, total_write - def write_to(self, writer, size=0, + def stream_reader(self, source, size=-1, + read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE): + self._ensure_cctx() + + try: + size = len(source) + except Exception: + pass + + if size < 0: + size = lib.ZSTD_CONTENTSIZE_UNKNOWN + + return CompressionReader(self, source, size, read_size) + + def stream_writer(self, writer, size=-1, write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): if not hasattr(writer, 'write'): raise ValueError('must pass an object with a write() method') + self._ensure_cctx() + + if size < 0: + size = lib.ZSTD_CONTENTSIZE_UNKNOWN + return ZstdCompressionWriter(self, writer, size, write_size) - def read_from(self, reader, size=0, - read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE, - write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): + write_to = stream_writer + + def read_to_iter(self, reader, size=-1, + 
read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE, + write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): if hasattr(reader, 'read'): have_read = True elif hasattr(reader, '__getitem__'): @@ -568,10 +967,15 @@ raise ValueError('must pass an object with a read() method or ' 'conforms to buffer protocol') - if self._multithreaded: - self._init_mtcstream(size) - else: - self._ensure_cstream(size) + self._ensure_cctx() + + if size < 0: + size = lib.ZSTD_CONTENTSIZE_UNKNOWN + + zresult = lib.ZSTD_CCtx_setPledgedSrcSize(self._cctx, size) + if lib.ZSTD_isError(zresult): + raise ZstdError('error setting source size: %s' % + _zstd_error(zresult)) in_buffer = ffi.new('ZSTD_inBuffer *') out_buffer = ffi.new('ZSTD_outBuffer *') @@ -611,13 +1015,11 @@ in_buffer.pos = 0 while in_buffer.pos < in_buffer.size: - if self._multithreaded: - zresult = lib.ZSTDMT_compressStream(self._cctx, out_buffer, in_buffer) - else: - zresult = lib.ZSTD_compressStream(self._cstream, out_buffer, in_buffer) + zresult = lib.ZSTD_compress_generic(self._cctx, out_buffer, in_buffer, + lib.ZSTD_e_continue) if lib.ZSTD_isError(zresult): raise ZstdError('zstd compress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:] @@ -633,13 +1035,13 @@ # remains. 
while True: assert out_buffer.pos == 0 - if self._multithreaded: - zresult = lib.ZSTDMT_endStream(self._cctx, out_buffer) - else: - zresult = lib.ZSTD_endStream(self._cstream, out_buffer) + zresult = lib.ZSTD_compress_generic(self._cctx, + out_buffer, + in_buffer, + lib.ZSTD_e_end) if lib.ZSTD_isError(zresult): raise ZstdError('error ending compression stream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:] @@ -649,67 +1051,12 @@ if zresult == 0: break - def _ensure_cstream(self, size): - if self._cstream: - zresult = lib.ZSTD_resetCStream(self._cstream, size) - if lib.ZSTD_isError(zresult): - raise ZstdError('could not reset CStream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) - - return - - cstream = lib.ZSTD_createCStream() - if cstream == ffi.NULL: - raise MemoryError() - - cstream = ffi.gc(cstream, lib.ZSTD_freeCStream) - - dict_data = ffi.NULL - dict_size = 0 - if self._dict_data: - dict_data = self._dict_data.as_bytes() - dict_size = len(self._dict_data) - - zparams = ffi.new('ZSTD_parameters *')[0] - if self._cparams: - zparams.cParams = self._cparams.as_compression_parameters() - else: - zparams.cParams = lib.ZSTD_getCParams(self._compression_level, - size, dict_size) - zparams.fParams = self._fparams + read_from = read_to_iter - zresult = lib.ZSTD_initCStream_advanced(cstream, dict_data, dict_size, - zparams, size) - if lib.ZSTD_isError(zresult): - raise Exception('cannot init CStream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) - - self._cstream = cstream - - def _init_mtcstream(self, size): - assert self._multithreaded + def frame_progression(self): + progression = lib.ZSTD_getFrameProgression(self._cctx) - dict_data = ffi.NULL - dict_size = 0 - if self._dict_data: - dict_data = self._dict_data.as_bytes() - dict_size = len(self._dict_data) - - zparams = ffi.new('ZSTD_parameters *')[0] - if self._cparams: - zparams.cParams = 
self._cparams.as_compression_parameters() - else: - zparams.cParams = lib.ZSTD_getCParams(self._compression_level, - size, dict_size) - - zparams.fParams = self._fparams - - zresult = lib.ZSTDMT_initCStream_advanced(self._cctx, dict_data, dict_size, - zparams, size) - - if lib.ZSTD_isError(zresult): - raise ZstdError('cannot init CStream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + return progression.ingested, progression.consumed, progression.produced class FrameParameters(object): @@ -720,16 +1067,38 @@ self.has_checksum = bool(fparams.checksumFlag) -def get_frame_parameters(data): - if not isinstance(data, bytes_type): - raise TypeError('argument must be bytes') +def frame_content_size(data): + data_buffer = ffi.from_buffer(data) + + size = lib.ZSTD_getFrameContentSize(data_buffer, len(data_buffer)) + + if size == lib.ZSTD_CONTENTSIZE_ERROR: + raise ZstdError('error when determining content size') + elif size == lib.ZSTD_CONTENTSIZE_UNKNOWN: + return -1 + else: + return size + - params = ffi.new('ZSTD_frameParams *') +def frame_header_size(data): + data_buffer = ffi.from_buffer(data) + + zresult = lib.ZSTD_frameHeaderSize(data_buffer, len(data_buffer)) + if lib.ZSTD_isError(zresult): + raise ZstdError('could not determine frame header size: %s' % + _zstd_error(zresult)) - zresult = lib.ZSTD_getFrameParams(params, data, len(data)) + return zresult + + +def get_frame_parameters(data): + params = ffi.new('ZSTD_frameHeader *') + + data_buffer = ffi.from_buffer(data) + zresult = lib.ZSTD_getFrameHeader(params, data_buffer, len(data_buffer)) if lib.ZSTD_isError(zresult): raise ZstdError('cannot get frame parameters: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if zresult: raise ZstdError('not enough data for frame parameters; need %d bytes' % @@ -739,12 +1108,20 @@ class ZstdCompressionDict(object): - def __init__(self, data, k=0, d=0): + def __init__(self, data, dict_type=DICT_TYPE_AUTO, k=0, d=0): assert isinstance(data, 
bytes_type) self._data = data self.k = k self.d = d + if dict_type not in (DICT_TYPE_AUTO, DICT_TYPE_RAWCONTENT, + DICT_TYPE_FULLDICT): + raise ValueError('invalid dictionary load mode: %d; must use ' + 'DICT_TYPE_* constants') + + self._dict_type = dict_type + self._cdict = None + def __len__(self): return len(self._data) @@ -754,51 +1131,55 @@ def as_bytes(self): return self._data + def precompute_compress(self, level=0, compression_params=None): + if level and compression_params: + raise ValueError('must only specify one of level or ' + 'compression_params') -def train_dictionary(dict_size, samples, selectivity=0, level=0, - notifications=0, dict_id=0): - if not isinstance(samples, list): - raise TypeError('samples must be a list') - - total_size = sum(map(len, samples)) - - samples_buffer = new_nonzero('char[]', total_size) - sample_sizes = new_nonzero('size_t[]', len(samples)) + if not level and not compression_params: + raise ValueError('must specify one of level or compression_params') - offset = 0 - for i, sample in enumerate(samples): - if not isinstance(sample, bytes_type): - raise ValueError('samples must be bytes') - - l = len(sample) - ffi.memmove(samples_buffer + offset, sample, l) - offset += l - sample_sizes[i] = l - - dict_data = new_nonzero('char[]', dict_size) + if level: + cparams = lib.ZSTD_getCParams(level, 0, len(self._data)) + else: + cparams = ffi.new('ZSTD_compressionParameters') + cparams.chainLog = compression_params.chain_log + cparams.hashLog = compression_params.hash_log + cparams.searchLength = compression_params.min_match + cparams.searchLog = compression_params.search_log + cparams.strategy = compression_params.compression_strategy + cparams.targetLength = compression_params.target_length + cparams.windowLog = compression_params.window_log - dparams = ffi.new('ZDICT_params_t *')[0] - dparams.selectivityLevel = selectivity - dparams.compressionLevel = level - dparams.notificationLevel = notifications - dparams.dictID = dict_id + 
cdict = lib.ZSTD_createCDict_advanced(self._data, len(self._data), + lib.ZSTD_dlm_byRef, + self._dict_type, + cparams, + lib.ZSTD_defaultCMem) + if cdict == ffi.NULL: + raise ZstdError('unable to precompute dictionary') + + self._cdict = ffi.gc(cdict, lib.ZSTD_freeCDict, + size=lib.ZSTD_sizeof_CDict(cdict)) - zresult = lib.ZDICT_trainFromBuffer_advanced( - ffi.addressof(dict_data), dict_size, - ffi.addressof(samples_buffer), - ffi.addressof(sample_sizes, 0), len(samples), - dparams) + @property + def _ddict(self): + ddict = lib.ZSTD_createDDict_advanced(self._data, len(self._data), + lib.ZSTD_dlm_byRef, + self._dict_type, + lib.ZSTD_defaultCMem) - if lib.ZDICT_isError(zresult): - raise ZstdError('Cannot train dict: %s' % - ffi.string(lib.ZDICT_getErrorName(zresult))) + if ddict == ffi.NULL: + raise ZstdError('could not create decompression dict') - return ZstdCompressionDict(ffi.buffer(dict_data, zresult)[:]) - + ddict = ffi.gc(ddict, lib.ZSTD_freeDDict, + size=lib.ZSTD_sizeof_DDict(ddict)) + self.__dict__['_ddict'] = ddict -def train_cover_dictionary(dict_size, samples, k=0, d=0, - notifications=0, dict_id=0, level=0, optimize=False, - steps=0, threads=0): + return ddict + +def train_dictionary(dict_size, samples, k=0, d=0, notifications=0, dict_id=0, + level=0, steps=0, threads=0): if not isinstance(samples, list): raise TypeError('samples must be a list') @@ -822,47 +1203,55 @@ dict_data = new_nonzero('char[]', dict_size) - dparams = ffi.new('COVER_params_t *')[0] + dparams = ffi.new('ZDICT_cover_params_t *')[0] dparams.k = k dparams.d = d dparams.steps = steps dparams.nbThreads = threads - dparams.notificationLevel = notifications - dparams.dictID = dict_id - dparams.compressionLevel = level + dparams.zParams.notificationLevel = notifications + dparams.zParams.dictID = dict_id + dparams.zParams.compressionLevel = level - if optimize: - zresult = lib.COVER_optimizeTrainFromBuffer( + if (not dparams.k and not dparams.d and not dparams.steps + and not 
dparams.nbThreads and not dparams.zParams.notificationLevel + and not dparams.zParams.dictID + and not dparams.zParams.compressionLevel): + zresult = lib.ZDICT_trainFromBuffer( + ffi.addressof(dict_data), dict_size, + ffi.addressof(samples_buffer), + ffi.addressof(sample_sizes, 0), len(samples)) + elif dparams.steps or dparams.nbThreads: + zresult = lib.ZDICT_optimizeTrainFromBuffer_cover( ffi.addressof(dict_data), dict_size, ffi.addressof(samples_buffer), ffi.addressof(sample_sizes, 0), len(samples), ffi.addressof(dparams)) else: - zresult = lib.COVER_trainFromBuffer( + zresult = lib.ZDICT_trainFromBuffer_cover( ffi.addressof(dict_data), dict_size, ffi.addressof(samples_buffer), ffi.addressof(sample_sizes, 0), len(samples), dparams) if lib.ZDICT_isError(zresult): - raise ZstdError('cannot train dict: %s' % - ffi.string(lib.ZDICT_getErrorName(zresult))) + msg = ffi.string(lib.ZDICT_getErrorName(zresult)).decode('utf-8') + raise ZstdError('cannot train dict: %s' % msg) return ZstdCompressionDict(ffi.buffer(dict_data, zresult)[:], + dict_type=DICT_TYPE_FULLDICT, k=dparams.k, d=dparams.d) class ZstdDecompressionObj(object): - def __init__(self, decompressor): + def __init__(self, decompressor, write_size): self._decompressor = decompressor + self._write_size = write_size self._finished = False def decompress(self, data): if self._finished: raise ZstdError('cannot use a decompressobj multiple times') - assert(self._decompressor._dstream) - in_buffer = ffi.new('ZSTD_inBuffer *') out_buffer = ffi.new('ZSTD_outBuffer *') @@ -871,7 +1260,7 @@ in_buffer.size = len(data_buffer) in_buffer.pos = 0 - dst_buffer = ffi.new('char[]', DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE) + dst_buffer = ffi.new('char[]', self._write_size) out_buffer.dst = dst_buffer out_buffer.size = len(dst_buffer) out_buffer.pos = 0 @@ -879,11 +1268,11 @@ chunks = [] while in_buffer.pos < in_buffer.size: - zresult = lib.ZSTD_decompressStream(self._decompressor._dstream, - out_buffer, in_buffer) + zresult = 
lib.ZSTD_decompress_generic(self._decompressor._dctx, + out_buffer, in_buffer) if lib.ZSTD_isError(zresult): raise ZstdError('zstd decompressor error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if zresult == 0: self._finished = True @@ -896,6 +1285,203 @@ return b''.join(chunks) +class DecompressionReader(object): + def __init__(self, decompressor, source, read_size): + self._decompressor = decompressor + self._source = source + self._read_size = read_size + self._entered = False + self._closed = False + self._bytes_decompressed = 0 + self._finished_input = False + self._finished_output = False + self._in_buffer = ffi.new('ZSTD_inBuffer *') + # Holds a ref to self._in_buffer.src. + self._source_buffer = None + + def __enter__(self): + if self._entered: + raise ValueError('cannot __enter__ multiple times') + + self._decompressor._ensure_dctx() + + self._entered = True + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + self._entered = False + self._closed = True + self._source = None + self._decompressor = None + + return False + + def readable(self): + return True + + def writable(self): + return False + + def seekable(self): + return True + + def readline(self): + raise NotImplementedError() + + def readlines(self): + raise NotImplementedError() + + def write(self, data): + raise io.UnsupportedOperation() + + def writelines(self, lines): + raise io.UnsupportedOperation() + + def isatty(self): + return False + + def flush(self): + return None + + def close(self): + self._closed = True + return None + + def closed(self): + return self._closed + + def tell(self): + return self._bytes_decompressed + + def readall(self): + raise NotImplementedError() + + def __iter__(self): + raise NotImplementedError() + + def __next__(self): + raise NotImplementedError() + + next = __next__ + + def read(self, size=-1): + if not self._entered: + raise ZstdError('read() must be called from an active context manager') + + if 
self._closed: + raise ValueError('stream is closed') + + if self._finished_output: + return b'' + + if size < 1: + raise ValueError('cannot read negative or size 0 amounts') + + dst_buffer = ffi.new('char[]', size) + out_buffer = ffi.new('ZSTD_outBuffer *') + out_buffer.dst = dst_buffer + out_buffer.size = size + out_buffer.pos = 0 + + def decompress(): + zresult = lib.ZSTD_decompress_generic(self._decompressor._dctx, + out_buffer, self._in_buffer) + + if self._in_buffer.pos == self._in_buffer.size: + self._in_buffer.src = ffi.NULL + self._in_buffer.pos = 0 + self._in_buffer.size = 0 + self._source_buffer = None + + if not hasattr(self._source, 'read'): + self._finished_input = True + + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd decompress error: %s', + _zstd_error(zresult)) + elif zresult == 0: + self._finished_output = True + + if out_buffer.pos and out_buffer.pos == out_buffer.size: + self._bytes_decompressed += out_buffer.size + return ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + + def get_input(): + if self._finished_input: + return + + if hasattr(self._source, 'read'): + data = self._source.read(self._read_size) + + if not data: + self._finished_input = True + return + + self._source_buffer = ffi.from_buffer(data) + self._in_buffer.src = self._source_buffer + self._in_buffer.size = len(self._source_buffer) + self._in_buffer.pos = 0 + else: + self._source_buffer = ffi.from_buffer(self._source) + self._in_buffer.src = self._source_buffer + self._in_buffer.size = len(self._source_buffer) + self._in_buffer.pos = 0 + + get_input() + result = decompress() + if result: + return result + + while not self._finished_input: + get_input() + result = decompress() + if result: + return result + + self._bytes_decompressed += out_buffer.pos + return ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + + def seek(self, pos, whence=os.SEEK_SET): + if not self._entered: + raise ZstdError('seek() must be called from an active context ' + 'manager') + + if self._closed: 
+ raise ValueError('stream is closed') + + read_amount = 0 + + if whence == os.SEEK_SET: + if pos < 0: + raise ValueError('cannot seek to negative position with SEEK_SET') + + if pos < self._bytes_decompressed: + raise ValueError('cannot seek zstd decompression stream ' + 'backwards') + + read_amount = pos - self._bytes_decompressed + + elif whence == os.SEEK_CUR: + if pos < 0: + raise ValueError('cannot seek zstd decompression stream ' + 'backwards') + + read_amount = pos + elif whence == os.SEEK_END: + raise ValueError('zstd decompression streams cannot be seeked ' + 'with SEEK_END') + + while read_amount: + result = self.read(min(read_amount, + DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)) + + if not result: + break + + read_amount -= len(result) + + return self._bytes_decompressed + class ZstdDecompressionWriter(object): def __init__(self, decompressor, writer, write_size): self._decompressor = decompressor @@ -907,7 +1493,7 @@ if self._entered: raise ZstdError('cannot __enter__ multiple times') - self._decompressor._ensure_dstream() + self._decompressor._ensure_dctx() self._entered = True return self @@ -916,11 +1502,11 @@ self._entered = False def memory_size(self): - if not self._decompressor._dstream: + if not self._decompressor._dctx: raise ZstdError('cannot determine size of inactive decompressor ' 'call when context manager is active') - return lib.ZSTD_sizeof_DStream(self._decompressor._dstream) + return lib.ZSTD_sizeof_DCtx(self._decompressor._dctx) def write(self, data): if not self._entered: @@ -941,13 +1527,13 @@ out_buffer.size = len(dst_buffer) out_buffer.pos = 0 - dstream = self._decompressor._dstream + dctx = self._decompressor._dctx while in_buffer.pos < in_buffer.size: - zresult = lib.ZSTD_decompressStream(dstream, out_buffer, in_buffer) + zresult = lib.ZSTD_decompress_generic(dctx, out_buffer, in_buffer) if lib.ZSTD_isError(zresult): raise ZstdError('zstd decompress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) 
if out_buffer.pos: self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) @@ -958,77 +1544,86 @@ class ZstdDecompressor(object): - def __init__(self, dict_data=None): + def __init__(self, dict_data=None, max_window_size=0, format=FORMAT_ZSTD1): self._dict_data = dict_data + self._max_window_size = max_window_size + self._format = format dctx = lib.ZSTD_createDCtx() if dctx == ffi.NULL: raise MemoryError() - self._refdctx = ffi.gc(dctx, lib.ZSTD_freeDCtx) - self._dstream = None - - @property - def _ddict(self): - if self._dict_data: - dict_data = self._dict_data.as_bytes() - dict_size = len(self._dict_data) + self._dctx = dctx - ddict = lib.ZSTD_createDDict(dict_data, dict_size) - if ddict == ffi.NULL: - raise ZstdError('could not create decompression dict') - else: - ddict = None + # Defer setting up garbage collection until full state is loaded so + # the memory size is more accurate. + try: + self._ensure_dctx() + finally: + self._dctx = ffi.gc(dctx, lib.ZSTD_freeDCtx, + size=lib.ZSTD_sizeof_DCtx(dctx)) - self.__dict__['_ddict'] = ddict - return ddict + def memory_size(self): + return lib.ZSTD_sizeof_DCtx(self._dctx) def decompress(self, data, max_output_size=0): + self._ensure_dctx() + data_buffer = ffi.from_buffer(data) - orig_dctx = new_nonzero('char[]', lib.ZSTD_sizeof_DCtx(self._refdctx)) - dctx = ffi.cast('ZSTD_DCtx *', orig_dctx) - lib.ZSTD_copyDCtx(dctx, self._refdctx) - - ddict = self._ddict + output_size = lib.ZSTD_getFrameContentSize(data_buffer, len(data_buffer)) - output_size = lib.ZSTD_getDecompressedSize(data_buffer, len(data_buffer)) - if output_size: - result_buffer = ffi.new('char[]', output_size) - result_size = output_size - else: + if output_size == lib.ZSTD_CONTENTSIZE_ERROR: + raise ZstdError('error determining content size from frame header') + elif output_size == 0: + return b'' + elif output_size == lib.ZSTD_CONTENTSIZE_UNKNOWN: if not max_output_size: - raise ZstdError('input data invalid or missing content size ' - 'in frame 
header') + raise ZstdError('could not determine content size in frame header') result_buffer = ffi.new('char[]', max_output_size) result_size = max_output_size + output_size = 0 + else: + result_buffer = ffi.new('char[]', output_size) + result_size = output_size - if ddict: - zresult = lib.ZSTD_decompress_usingDDict(dctx, - result_buffer, result_size, - data_buffer, len(data_buffer), - ddict) - else: - zresult = lib.ZSTD_decompressDCtx(dctx, - result_buffer, result_size, - data_buffer, len(data_buffer)) + out_buffer = ffi.new('ZSTD_outBuffer *') + out_buffer.dst = result_buffer + out_buffer.size = result_size + out_buffer.pos = 0 + + in_buffer = ffi.new('ZSTD_inBuffer *') + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) + in_buffer.pos = 0 + + zresult = lib.ZSTD_decompress_generic(self._dctx, out_buffer, in_buffer) if lib.ZSTD_isError(zresult): raise ZstdError('decompression error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) - elif output_size and zresult != output_size: + _zstd_error(zresult)) + elif zresult: + raise ZstdError('decompression error: did not decompress full frame') + elif output_size and out_buffer.pos != output_size: raise ZstdError('decompression error: decompressed %d bytes; expected %d' % (zresult, output_size)) - return ffi.buffer(result_buffer, zresult)[:] + return ffi.buffer(result_buffer, out_buffer.pos)[:] + + def stream_reader(self, source, read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE): + self._ensure_dctx() + return DecompressionReader(self, source, read_size) - def decompressobj(self): - self._ensure_dstream() - return ZstdDecompressionObj(self) + def decompressobj(self, write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE): + if write_size < 1: + raise ValueError('write_size must be positive') - def read_from(self, reader, read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE, - write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE, - skip_bytes=0): + self._ensure_dctx() + return ZstdDecompressionObj(self, 
write_size=write_size) + + def read_to_iter(self, reader, read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE, + write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE, + skip_bytes=0): if skip_bytes >= read_size: raise ValueError('skip_bytes must be smaller than read_size') @@ -1051,7 +1646,7 @@ buffer_offset = skip_bytes - self._ensure_dstream() + self._ensure_dctx() in_buffer = ffi.new('ZSTD_inBuffer *') out_buffer = ffi.new('ZSTD_outBuffer *') @@ -1086,10 +1681,10 @@ while in_buffer.pos < in_buffer.size: assert out_buffer.pos == 0 - zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer) + zresult = lib.ZSTD_decompress_generic(self._dctx, out_buffer, in_buffer) if lib.ZSTD_isError(zresult): raise ZstdError('zstd decompress error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:] @@ -1104,12 +1699,16 @@ # If we get here, input is exhausted. - def write_to(self, writer, write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE): + read_from = read_to_iter + + def stream_writer(self, writer, write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE): if not hasattr(writer, 'write'): raise ValueError('must pass an object with a write() method') return ZstdDecompressionWriter(self, writer, write_size) + write_to = stream_writer + def copy_stream(self, ifh, ofh, read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE, write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE): @@ -1118,7 +1717,7 @@ if not hasattr(ofh, 'write'): raise ValueError('second argument must have a write() method') - self._ensure_dstream() + self._ensure_dctx() in_buffer = ffi.new('ZSTD_inBuffer *') out_buffer = ffi.new('ZSTD_outBuffer *') @@ -1144,10 +1743,10 @@ # Flush all read data to output. 
while in_buffer.pos < in_buffer.size: - zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer) + zresult = lib.ZSTD_decompress_generic(self._dctx, out_buffer, in_buffer) if lib.ZSTD_isError(zresult): raise ZstdError('zstd decompressor error: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) if out_buffer.pos: ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) @@ -1172,29 +1771,36 @@ # All chunks should be zstd frames and should have content size set. chunk_buffer = ffi.from_buffer(chunk) - params = ffi.new('ZSTD_frameParams *') - zresult = lib.ZSTD_getFrameParams(params, chunk_buffer, len(chunk_buffer)) + params = ffi.new('ZSTD_frameHeader *') + zresult = lib.ZSTD_getFrameHeader(params, chunk_buffer, len(chunk_buffer)) if lib.ZSTD_isError(zresult): raise ValueError('chunk 0 is not a valid zstd frame') elif zresult: raise ValueError('chunk 0 is too small to contain a zstd frame') - if not params.frameContentSize: + if params.frameContentSize == lib.ZSTD_CONTENTSIZE_UNKNOWN: raise ValueError('chunk 0 missing content size in frame') - dctx = lib.ZSTD_createDCtx() - if dctx == ffi.NULL: - raise MemoryError() - - dctx = ffi.gc(dctx, lib.ZSTD_freeDCtx) + self._ensure_dctx(load_dict=False) last_buffer = ffi.new('char[]', params.frameContentSize) - zresult = lib.ZSTD_decompressDCtx(dctx, last_buffer, len(last_buffer), - chunk_buffer, len(chunk_buffer)) + out_buffer = ffi.new('ZSTD_outBuffer *') + out_buffer.dst = last_buffer + out_buffer.size = len(last_buffer) + out_buffer.pos = 0 + + in_buffer = ffi.new('ZSTD_inBuffer *') + in_buffer.src = chunk_buffer + in_buffer.size = len(chunk_buffer) + in_buffer.pos = 0 + + zresult = lib.ZSTD_decompress_generic(self._dctx, out_buffer, in_buffer) if lib.ZSTD_isError(zresult): raise ZstdError('could not decompress chunk 0: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + _zstd_error(zresult)) + elif zresult: + raise ZstdError('chunk 0 did not decompress full frame') # Special case 
of chain length of 1 if len(frames) == 1: @@ -1207,51 +1813,54 @@ raise ValueError('chunk %d must be bytes' % i) chunk_buffer = ffi.from_buffer(chunk) - zresult = lib.ZSTD_getFrameParams(params, chunk_buffer, len(chunk_buffer)) + zresult = lib.ZSTD_getFrameHeader(params, chunk_buffer, len(chunk_buffer)) if lib.ZSTD_isError(zresult): raise ValueError('chunk %d is not a valid zstd frame' % i) elif zresult: raise ValueError('chunk %d is too small to contain a zstd frame' % i) - if not params.frameContentSize: + if params.frameContentSize == lib.ZSTD_CONTENTSIZE_UNKNOWN: raise ValueError('chunk %d missing content size in frame' % i) dest_buffer = ffi.new('char[]', params.frameContentSize) - zresult = lib.ZSTD_decompress_usingDict(dctx, dest_buffer, len(dest_buffer), - chunk_buffer, len(chunk_buffer), - last_buffer, len(last_buffer)) + out_buffer.dst = dest_buffer + out_buffer.size = len(dest_buffer) + out_buffer.pos = 0 + + in_buffer.src = chunk_buffer + in_buffer.size = len(chunk_buffer) + in_buffer.pos = 0 + + zresult = lib.ZSTD_decompress_generic(self._dctx, out_buffer, in_buffer) if lib.ZSTD_isError(zresult): - raise ZstdError('could not decompress chunk %d' % i) + raise ZstdError('could not decompress chunk %d: %s' % + _zstd_error(zresult)) + elif zresult: + raise ZstdError('chunk %d did not decompress full frame' % i) last_buffer = dest_buffer i += 1 return ffi.buffer(last_buffer, len(last_buffer))[:] - def _ensure_dstream(self): - if self._dstream: - zresult = lib.ZSTD_resetDStream(self._dstream) + def _ensure_dctx(self, load_dict=True): + lib.ZSTD_DCtx_reset(self._dctx) + + if self._max_window_size: + zresult = lib.ZSTD_DCtx_setMaxWindowSize(self._dctx, + self._max_window_size) if lib.ZSTD_isError(zresult): - raise ZstdError('could not reset DStream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) - - return - - self._dstream = lib.ZSTD_createDStream() - if self._dstream == ffi.NULL: - raise MemoryError() + raise ZstdError('unable to set max window size: 
%s' % + _zstd_error(zresult)) - self._dstream = ffi.gc(self._dstream, lib.ZSTD_freeDStream) + zresult = lib.ZSTD_DCtx_setFormat(self._dctx, self._format) + if lib.ZSTD_isError(zresult): + raise ZstdError('unable to set decoding format: %s' % + _zstd_error(zresult)) - if self._dict_data: - zresult = lib.ZSTD_initDStream_usingDict(self._dstream, - self._dict_data.as_bytes(), - len(self._dict_data)) - else: - zresult = lib.ZSTD_initDStream(self._dstream) - - if lib.ZSTD_isError(zresult): - self._dstream = None - raise ZstdError('could not initialize DStream: %s' % - ffi.string(lib.ZSTD_getErrorName(zresult))) + if self._dict_data and load_dict: + zresult = lib.ZSTD_DCtx_refDDict(self._dctx, self._dict_data._ddict) + if lib.ZSTD_isError(zresult): + raise ZstdError('unable to reference prepared dictionary: %s' % + _zstd_error(zresult)) diff -r fb92df8b634c -r ed5448edcbfa contrib/python3-ratchet.py --- a/contrib/python3-ratchet.py Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python3-ratchet.py Wed Apr 18 15:32:08 2018 -0400 @@ -80,8 +80,7 @@ print('warning: Python 3.6.0 and 3.6.1 have ' 'a bug which breaks Mercurial') print('(see https://bugs.python.org/issue29714 for details)') - # TODO(augie): uncomment exit when Python 3.6.2 is available - # sys.exit(1) + sys.exit(1) rt = subprocess.Popen([opts.python3, 'run-tests.py', '-j', str(opts.j), '--blacklist', opts.working_tests, '--json']) diff -r fb92df8b634c -r ed5448edcbfa contrib/python3-whitelist --- a/contrib/python3-whitelist Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/python3-whitelist Wed Apr 18 15:32:08 2018 -0400 @@ -1,16 +1,42 @@ +test-abort-checkin.t test-add.t test-addremove-similar.t test-addremove.t +test-amend-subrepo.t +test-amend.t test-ancestor.py +test-annotate.py +test-annotate.t +test-archive-symlinks.t +test-atomictempfile.py +test-audit-path.t +test-audit-subrepo.t test-automv.t +test-backout.t test-backwards-remove.t +test-basic.t test-bheads.t +test-bisect.t test-bisect2.t +test-bisect3.t 
+test-blackbox.t +test-bookmarks-current.t test-bookmarks-merge.t +test-bookmarks-rebase.t test-bookmarks-strip.t +test-bookmarks.t +test-branch-change.t +test-branch-option.t test-branch-tag-confict.t +test-branches.t +test-bundle-phases.t +test-bundle-type.t +test-bundle-vs-outgoing.t +test-bundle2-multiple-changegroups.t +test-cappedreader.py test-casecollision.t test-cat.t +test-censor.t test-changelog-exec.t test-check-commit.t test-check-execute.t @@ -19,55 +45,207 @@ test-check-pylint.t test-check-shbang.t test-children.t +test-clone-cgi.t +test-clone-pull-corruption.t +test-clone-r.t +test-clone-update-order.t +test-command-template.t +test-commit-amend.t +test-commit-interactive.t +test-commit-multiple.t test-commit-unresolved.t +test-commit.t +test-committer.t test-completion.t +test-config-env.py +test-config.t +test-conflict.t +test-confused-revert.t test-contrib-check-code.t test-contrib-check-commit.t +test-convert-authormap.t +test-convert-clonebranches.t +test-convert-datesort.t +test-convert-filemap.t +test-convert-hg-sink.t +test-convert-hg-source.t +test-convert-hg-startrev.t +test-convert-splicemap.t +test-convert-tagsbranch-topology.t +test-copy-move-merge.t +test-copy.t +test-copytrace-heuristics.t +test-debugbuilddag.t +test-debugbundle.t +test-debugextensions.t +test-debugindexdot.t test-debugrename.t +test-default-push.t +test-diff-binary-file.t +test-diff-change.t test-diff-copy-depth.t test-diff-hashes.t +test-diff-ignore-whitespace.t +test-diff-indent-heuristic.t test-diff-issue2761.t test-diff-newlines.t test-diff-reverse.t test-diff-subdir.t +test-diff-unified.t +test-diff-upgrade.t test-diffdir.t +test-diffstat.t test-directaccess.t +test-dirstate-backup.t test-dirstate-nonnormalset.t +test-dirstate.t test-doctest.py test-double-merge.t +test-drawdag.t test-duplicateoptions.py +test-editor-filename.t test-empty-dir.t test-empty-file.t +test-empty-group.t test-empty.t +test-encode.t test-encoding-func.py +test-encoding.t 
+test-eol-add.t +test-eol-clone.t +test-eol-hook.t +test-eol-tag.t +test-eol-update.t test-excessive-merge.t +test-exchange-obsmarkers-case-A1.t +test-exchange-obsmarkers-case-A2.t +test-exchange-obsmarkers-case-A3.t +test-exchange-obsmarkers-case-A4.t +test-exchange-obsmarkers-case-A5.t +test-exchange-obsmarkers-case-A6.t +test-exchange-obsmarkers-case-A7.t +test-exchange-obsmarkers-case-B1.t +test-exchange-obsmarkers-case-B2.t +test-exchange-obsmarkers-case-B3.t +test-exchange-obsmarkers-case-B4.t +test-exchange-obsmarkers-case-B5.t +test-exchange-obsmarkers-case-B6.t +test-exchange-obsmarkers-case-B7.t +test-exchange-obsmarkers-case-C1.t +test-exchange-obsmarkers-case-C2.t +test-exchange-obsmarkers-case-C3.t +test-exchange-obsmarkers-case-C4.t +test-exchange-obsmarkers-case-D1.t +test-exchange-obsmarkers-case-D2.t +test-exchange-obsmarkers-case-D3.t +test-exchange-obsmarkers-case-D4.t test-execute-bit.t +test-export.t +test-extdata.t +test-extdiff.t +test-extra-filelog-entry.t +test-filebranch.t +test-fileset-generated.t +test-fix-topology.t +test-flags.t +test-generaldelta.t +test-getbundle.t +test-git-export.t +test-glog-topological.t test-gpg.t +test-graft.t +test-hg-parseurl.py test-hghave.t +test-hgignore.t +test-hgk.t +test-hgweb-bundle.t +test-hgweb-descend-empties.t +test-hgweb-empty.t +test-hgweb-removed.t +test-hgwebdirsym.t +test-histedit-arguments.t +test-histedit-base.t +test-histedit-bookmark-motion.t +test-histedit-commute.t +test-histedit-drop.t +test-histedit-edit.t +test-histedit-fold-non-commute.t +test-histedit-fold.t +test-histedit-no-change.t +test-histedit-non-commute-abort.t +test-histedit-non-commute.t +test-histedit-obsolete.t +test-histedit-outgoing.t +test-histedit-templates.t +test-http-branchmap.t +test-http-bundle1.t +test-http-clone-r.t +test-http.t +test-identify.t +test-import-unknown.t +test-import.t test-imports-checker.t +test-incoming-outgoing.t +test-inherit-mode.t test-issue1089.t +test-issue1102.t test-issue1175.t 
+test-issue1306.t +test-issue1438.t test-issue1502.t test-issue1802.t test-issue1877.t test-issue1993.t +test-issue2137.t +test-issue3084.t +test-issue4074.t test-issue522.t +test-issue586.t test-issue612.t test-issue619.t +test-issue660.t test-issue672.t test-issue842.t test-journal-exists.t +test-journal-share.t +test-journal.t +test-largefiles-cache.t +test-largefiles-misc.t +test-largefiles-small-disk.t +test-largefiles-update.t +test-largefiles.t +test-lfs-largefiles.t +test-linerange.py test-locate.t +test-lock-badness.t +test-log-linerange.t +test-log.t +test-logexchange.t test-lrucachedict.py -test-manifest.py +test-mactext.t +test-mailmap.t test-manifest-merging.t +test-manifest.py +test-manifest.t test-match.py +test-mdiff.py +test-merge-changedelete.t +test-merge-closedheads.t +test-merge-commit.t +test-merge-criss-cross.t test-merge-default.t +test-merge-force.t +test-merge-halt.t test-merge-internal-tools-pattern.t +test-merge-local.t test-merge-remove.t test-merge-revert.t test-merge-revert2.t test-merge-subrepos.t +test-merge-symlinks.t +test-merge-tools.t +test-merge-types.t +test-merge1.t test-merge10.t test-merge2.t test-merge4.t @@ -75,9 +253,74 @@ test-merge6.t test-merge7.t test-merge8.t +test-merge9.t +test-mq-git.t +test-mq-header-date.t +test-mq-header-from.t +test-mq-merge.t +test-mq-pull-from-bundle.t +test-mq-qclone-http.t +test-mq-qdelete.t +test-mq-qdiff.t +test-mq-qfold.t +test-mq-qgoto.t test-mq-qimport-fail-cleanup.t +test-mq-qnew.t +test-mq-qpush-exact.t +test-mq-qqueue.t +test-mq-qrefresh-interactive.t +test-mq-qrefresh-replace-log-message.t +test-mq-qrefresh.t +test-mq-qrename.t +test-mq-qsave.t +test-mq-safety.t +test-mq-subrepo.t +test-mq-symlinks.t +test-mv-cp-st-diff.t +test-narrow-archive.t +test-narrow-clone-no-ellipsis.t +test-narrow-clone-non-narrow-server.t +test-narrow-clone-nonlinear.t +test-narrow-clone.t +test-narrow-commit.t +test-narrow-copies.t +test-narrow-debugcommands.t +test-narrow-debugrebuilddirstate.t 
+test-narrow-exchange-merges.t +test-narrow-exchange.t +test-narrow-expanddirstate.t +test-narrow-merge.t +test-narrow-patch.t +test-narrow-patterns.t +test-narrow-pull.t +test-narrow-rebase.t +test-narrow-shallow-merges.t +test-narrow-shallow.t +test-narrow-strip.t +test-narrow-update.t +test-nested-repo.t +test-newbranch.t test-obshistory.t +test-obsmarker-template.t +test-obsmarkers-effectflag.t +test-obsolete-bundle-strip.t +test-obsolete-changeset-exchange.t +test-obsolete-checkheads.t +test-obsolete-distributed.t +test-obsolete-tag-cache.t +test-parents.t +test-pathconflicts-merge.t +test-pathconflicts-update.t +test-pending.t test-permissions.t +test-phases.t +test-pull-branch.t +test-pull-http.t +test-pull-permission.t +test-pull-pull-corruption.t +test-pull-r.t +test-pull-update.t +test-purge.t test-push-checkheads-partial-C1.t test-push-checkheads-partial-C2.t test-push-checkheads-partial-C3.t @@ -105,27 +348,110 @@ test-push-checkheads-unpushed-D5.t test-push-checkheads-unpushed-D6.t test-push-checkheads-unpushed-D7.t +test-push-http.t +test-push-warn.t +test-pushvars.t +test-rebase-abort.t +test-rebase-base-flag.t +test-rebase-bookmarks.t +test-rebase-brute-force.t +test-rebase-cache.t +test-rebase-check-restore.t +test-rebase-collapse.t +test-rebase-conflicts.t +test-rebase-dest.t +test-rebase-detach.t +test-rebase-emptycommit.t +test-rebase-inmemory.t +test-rebase-interruptions.t +test-rebase-issue-noparam-single-rev.t +test-rebase-legacy.t +test-rebase-mq-skip.t +test-rebase-mq.t +test-rebase-named-branches.t +test-rebase-newancestor.t +test-rebase-obsolete.t +test-rebase-parameters.t +test-rebase-partial.t +test-rebase-pull.t +test-rebase-rename.t +test-rebase-scenario-global.t +test-rebase-templates.t +test-rebase-transaction.t test-record.t +test-relink.t +test-remove.t +test-rename-after-merge.t test-rename-dir-merge.t test-rename-merge1.t test-rename.t +test-repair-strip.t +test-repo-compengines.t +test-resolve.t test-revert-flags.t 
test-revert-unknown.t +test-revlog-ancestry.py test-revlog-group-emptyiter.t test-revlog-mmapindex.t test-revlog-packentry.t +test-revset-dirstate-parents.t +test-revset-legacy-lookup.t +test-revset-outgoing.t +test-rollback.t test-run-tests.py +test-run-tests.t +test-schemes.t +test-serve.t +test-setdiscovery.t +test-share.t +test-shelve.t test-show-stack.t +test-show-work.t +test-show.t test-simple-update.t +test-single-head.t test-sparse-clear.t +test-sparse-import.t test-sparse-merges.t +test-sparse-profiles.t test-sparse-requirement.t test-sparse-verbose-json.t +test-ssh-clone-r.t +test-ssh-proto.t +test-sshserver.py +test-stack.t +test-status-rev.t test-status-terse.t +test-strip-cross.t +test-strip.t +test-subrepo-deep-nested-change.t +test-subrepo-missing.t +test-subrepo-recursion.t +test-subrepo-relative-path.t +test-subrepo.t +test-symlinks.t +test-tag.t +test-tags.t +test-template-engine.t +test-treemanifest.t +test-unamend.t test-uncommit.t test-unified-test.t test-unrelated-pull.t +test-up-local-change.t +test-update-branches.t +test-update-dest.t test-update-issue1456.t test-update-names.t test-update-reverse.t +test-upgrade-repo.t +test-url-download.t +test-url-rev.t +test-username-newline.t +test-verify.t +test-websub.t +test-win32text.t +test-wireproto-clientreactor.py +test-wireproto-framing.py +test-wireproto-serverreactor.py test-xdg.t diff -r fb92df8b634c -r ed5448edcbfa contrib/simplemerge --- a/contrib/simplemerge Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/simplemerge Wed Apr 18 15:32:08 2018 -0400 @@ -14,7 +14,9 @@ fancyopts, simplemerge, ui as uimod, - util, +) +from mercurial.utils import ( + procutil, ) options = [('L', 'label', [], _('labels to use on conflict markers')), @@ -52,7 +54,7 @@ try: for fp in (sys.stdin, sys.stdout, sys.stderr): - util.setbinary(fp) + procutil.setbinary(fp) opts = {} try: diff -r fb92df8b634c -r ed5448edcbfa contrib/synthrepo.py --- a/contrib/synthrepo.py Wed Apr 04 10:35:09 2018 -0400 +++ 
b/contrib/synthrepo.py Wed Apr 18 15:32:08 2018 -0400 @@ -59,8 +59,8 @@ patch, registrar, scmutil, - util, ) +from mercurial.utils import dateutil # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should @@ -381,8 +381,8 @@ ui.progress(_synthesizing, None) message = 'synthesized wide repo with %d files' % (len(files),) mc = context.memctx(repo, [pctx.node(), nullid], message, - files.iterkeys(), filectxfn, ui.username(), - '%d %d' % util.makedate()) + files, filectxfn, ui.username(), + '%d %d' % dateutil.makedate()) initnode = mc.commit() if ui.debugflag: hexfn = hex diff -r fb92df8b634c -r ed5448edcbfa contrib/undumprevlog --- a/contrib/undumprevlog Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/undumprevlog Wed Apr 18 15:32:08 2018 -0400 @@ -10,12 +10,14 @@ node, revlog, transaction, - util, vfs as vfsmod, ) +from mercurial.utils import ( + procutil, +) for fp in (sys.stdin, sys.stdout, sys.stderr): - util.setbinary(fp) + procutil.setbinary(fp) opener = vfsmod.vfs('.', False) tr = transaction.transaction(sys.stderr.write, opener, {'store': opener}, diff -r fb92df8b634c -r ed5448edcbfa contrib/wix/dist.wxs --- a/contrib/wix/dist.wxs Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/wix/dist.wxs Wed Apr 18 15:32:08 2018 -0400 @@ -14,7 +14,6 @@ - diff -r fb92df8b634c -r ed5448edcbfa contrib/wix/help.wxs --- a/contrib/wix/help.wxs Wed Apr 04 10:35:09 2018 -0400 +++ b/contrib/wix/help.wxs Wed Apr 18 15:32:08 2018 -0400 @@ -40,6 +40,7 @@ + diff -r fb92df8b634c -r ed5448edcbfa hgext/acl.py --- a/hgext/acl.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/acl.py Wed Apr 18 15:32:08 2018 -0400 @@ -193,8 +193,6 @@ from __future__ import absolute_import -import getpass - from mercurial.i18n import _ from mercurial import ( error, @@ -203,6 +201,9 @@ registrar, util, ) +from mercurial.utils import ( + procutil, +) urlreq = util.urlreq @@ -334,13 +335,13 @@ return user = None - if source == 
'serve' and 'url' in kwargs: - url = kwargs['url'].split(':') + if source == 'serve' and r'url' in kwargs: + url = kwargs[r'url'].split(':') if url[0] == 'remote' and url[1].startswith('http'): user = urlreq.unquote(url[3]) if user is None: - user = getpass.getuser() + user = procutil.getuser() ui.debug('acl: checking access for user "%s"\n' % user) @@ -355,7 +356,7 @@ allow = buildmatch(ui, repo, user, 'acl.allow') deny = buildmatch(ui, repo, user, 'acl.deny') - for rev in xrange(repo[node], len(repo)): + for rev in xrange(repo[node].rev(), len(repo)): ctx = repo[rev] branch = ctx.branch() if denybranches and denybranches(branch): diff -r fb92df8b634c -r ed5448edcbfa hgext/blackbox.py --- a/hgext/blackbox.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/blackbox.py Wed Apr 18 15:32:08 2018 -0400 @@ -49,6 +49,10 @@ ui as uimod, util, ) +from mercurial.utils import ( + dateutil, + procutil, +) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for # extensions which SHIP WITH MERCURIAL. 
Non-mainline extensions should @@ -164,9 +168,9 @@ return ui._bbinlog = True default = self.configdate('devel', 'default-date') - date = util.datestr(default, '%Y/%m/%d %H:%M:%S') - user = util.getuser() - pid = '%d' % util.getpid() + date = dateutil.datestr(default, '%Y/%m/%d %H:%M:%S') + user = procutil.getuser() + pid = '%d' % procutil.getpid() formattedmsg = msg[0] % msg[1:] rev = '(unknown)' changed = '' diff -r fb92df8b634c -r ed5448edcbfa hgext/bugzilla.py --- a/hgext/bugzilla.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/bugzilla.py Wed Apr 18 15:32:08 2018 -0400 @@ -300,13 +300,17 @@ from mercurial.i18n import _ from mercurial.node import short from mercurial import ( - cmdutil, error, + logcmdutil, mail, registrar, url, util, ) +from mercurial.utils import ( + procutil, + stringutil, +) xmlrpclib = util.xmlrpclib @@ -524,13 +528,13 @@ except TypeError: cmd = cmdfmt % {'bzdir': bzdir, 'id': id, 'user': user} self.ui.note(_('running notify command %s\n') % cmd) - fp = util.popen('(%s) 2>&1' % cmd) - out = fp.read() + fp = procutil.popen('(%s) 2>&1' % cmd, 'rb') + out = util.fromnativeeol(fp.read()) ret = fp.close() if ret: self.ui.warn(out) raise error.Abort(_('bugzilla notify command %s') % - util.explainexit(ret)[0]) + procutil.explainexit(ret)) self.ui.status(_('done\n')) def get_user_id(self, user): @@ -1090,9 +1094,8 @@ if not mapfile and not tmpl: tmpl = _('changeset {node|short} in repo {root} refers ' 'to bug {bug}.\ndetails:\n\t{desc|tabindent}') - spec = cmdutil.logtemplatespec(tmpl, mapfile) - t = cmdutil.changeset_templater(self.ui, self.repo, spec, - False, None, False) + spec = logcmdutil.templatespec(tmpl, mapfile) + t = logcmdutil.changesettemplater(self.ui, self.repo, spec) self.ui.pushbuffer() t.show(ctx, changes=ctx.changeset(), bug=str(bugid), @@ -1100,7 +1103,8 @@ root=self.repo.root, webroot=webroot(self.repo.root)) data = self.ui.popbuffer() - self.bzdriver.updatebug(bugid, newstate, data, util.email(ctx.user())) + 
self.bzdriver.updatebug(bugid, newstate, data, + stringutil.email(ctx.user())) def notify(self, bugs, committer): '''ensure Bugzilla users are notified of bug change.''' @@ -1120,6 +1124,6 @@ if bugs: for bug in bugs: bz.update(bug, bugs[bug], ctx) - bz.notify(bugs, util.email(ctx.user())) + bz.notify(bugs, stringutil.email(ctx.user())) except Exception as e: raise error.Abort(_('Bugzilla error: %s') % e) diff -r fb92df8b634c -r ed5448edcbfa hgext/censor.py --- a/hgext/censor.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/censor.py Wed Apr 18 15:32:08 2018 -0400 @@ -32,7 +32,6 @@ from mercurial import ( error, - filelog, lock as lockmod, registrar, revlog, @@ -106,7 +105,7 @@ raise error.Abort( _('censor does not support revlog version %d') % (flogv,)) - tombstone = filelog.packmeta({"censored": tombstone}, "") + tombstone = revlog.packmeta({"censored": tombstone}, "") crev = fctx.filerev() diff -r fb92df8b634c -r ed5448edcbfa hgext/children.py --- a/hgext/children.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/children.py Wed Apr 18 15:32:08 2018 -0400 @@ -19,8 +19,10 @@ from mercurial.i18n import _ from mercurial import ( cmdutil, + logcmdutil, pycompat, registrar, + scmutil, ) templateopts = cmdutil.templateopts @@ -34,7 +36,7 @@ testedwith = 'ships-with-hg-core' @command('children', - [('r', 'rev', '', + [('r', 'rev', '.', _('show children of the specified revision'), _('REV')), ] + templateopts, _('hg children [-r REV] [FILE]'), @@ -58,14 +60,14 @@ """ opts = pycompat.byteskwargs(opts) rev = opts.get('rev') + ctx = scmutil.revsingle(repo, rev) if file_: - fctx = repo.filectx(file_, changeid=rev) + fctx = repo.filectx(file_, changeid=ctx.rev()) childctxs = [fcctx.changectx() for fcctx in fctx.children()] else: - ctx = repo[rev] childctxs = ctx.children() - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) for cctx in childctxs: displayer.show(cctx) displayer.close() diff -r fb92df8b634c -r ed5448edcbfa 
hgext/churn.py --- a/hgext/churn.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/churn.py Wed Apr 18 15:32:08 2018 -0400 @@ -18,12 +18,13 @@ from mercurial import ( cmdutil, encoding, + logcmdutil, patch, pycompat, registrar, scmutil, - util, ) +from mercurial.utils import dateutil cmdtable = {} command = registrar.command(cmdtable) @@ -54,7 +55,7 @@ return date.strftime(opts['dateformat']) else: tmpl = opts.get('oldtemplate') or opts.get('template') - tmpl = cmdutil.makelogtemplater(ui, repo, tmpl) + tmpl = logcmdutil.maketemplater(ui, repo, tmpl) def getkey(ctx): ui.pushbuffer() tmpl.show(ctx) @@ -64,7 +65,7 @@ rate = {} df = False if opts.get('date'): - df = util.matchdate(opts['date']) + df = dateutil.matchdate(opts['date']) m = scmutil.match(repo[None], pats, opts) def prep(ctx, fns): @@ -170,7 +171,7 @@ ui.warn(_("skipping malformed alias: %s\n") % l) continue - rate = countrate(ui, repo, amap, *pats, **opts).items() + rate = list(countrate(ui, repo, amap, *pats, **opts).items()) if not rate: return diff -r fb92df8b634c -r ed5448edcbfa hgext/clonebundles.py --- a/hgext/clonebundles.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/clonebundles.py Wed Apr 18 15:32:08 2018 -0400 @@ -6,7 +6,8 @@ "clonebundles" is a server-side extension used to advertise the existence of pre-generated, externally hosted bundle files to clients that are cloning so that cloning can be faster, more reliable, and require less -resources on the server. +resources on the server. "pullbundles" is a related feature for sending +pre-generated bundle files to clients as part of pull operations. Cloning can be a CPU and I/O intensive operation on servers. Traditionally, the server, in response to a client's request to clone, dynamically generates @@ -16,8 +17,12 @@ servers with large repositories or with high clone volume, the load from clones can make scaling the server challenging and costly. 
-This extension provides server operators the ability to offload potentially -expensive clone load to an external service. Here's how it works. +This extension provides server operators the ability to offload +potentially expensive clone load to an external service. Pre-generated +bundles also allow using more CPU intensive compression, reducing the +effective bandwidth requirements. + +Here's how clone bundles work: 1. A server operator establishes a mechanism for making bundle files available on a hosting service where Mercurial clients can fetch them. @@ -33,7 +38,7 @@ 7. The client reconnects to the original server and performs the equivalent of :hg:`pull` to retrieve all repository data not in the bundle. (The repository could have been updated between when the bundle was created - and when the client started the clone.) + and when the client started the clone.) This may use "pullbundles". Instead of the server generating full repository bundles for every clone request, it generates full bundles once and they are subsequently reused to @@ -42,13 +47,27 @@ created. For large, established repositories, this can reduce server load for clones to less than 1% of original. +Here's how pullbundles work: + +1. A manifest file listing available bundles and describing the revisions + is added to the Mercurial repository on the server. +2. A new-enough client informs the server that it supports partial pulls + and initiates a pull. +3. If the server has pull bundles enabled and sees the client advertising + partial pulls, it checks for a matching pull bundle in the manifest. + A bundle matches if the format is supported by the client, the client + has the required revisions already and needs something from the bundle. +4. If there is at least one matching bundle, the server sends it to the client. +5. The client applies the bundle and notices that the server reply was + incomplete. It initiates another pull. 
+ To work, this extension requires the following of server operators: * Generating bundle files of repository content (typically periodically, such as once per day). -* A file server that clients have network access to and that Python knows - how to talk to through its normal URL handling facility (typically an - HTTP server). +* Clone bundles: A file server that clients have network access to and that + Python knows how to talk to through its normal URL handling facility + (typically an HTTP/HTTPS server). * A process for keeping the bundles manifest in sync with available bundle files. @@ -61,7 +80,7 @@ :hg:`bundle --all` is used to produce a bundle of the entire repository. :hg:`debugcreatestreamclonebundle` can be used to produce a special -*streaming clone bundle*. These are bundle files that are extremely efficient +*streaming clonebundle*. These are bundle files that are extremely efficient to produce and consume (read: fast). However, they are larger than traditional bundle formats and require that clients support the exact set of repository data store formats in use by the repository that created them. @@ -73,7 +92,8 @@ A server operator is responsible for creating a ``.hg/clonebundles.manifest`` file containing the list of available bundle files suitable for seeding clones. If this file does not exist, the repository will not advertise the -existence of clone bundles when clients connect. +existence of clone bundles when clients connect. For pull bundles, +``.hg/pullbundles.manifest`` is used. The manifest file contains a newline (\\n) delimited list of entries. @@ -85,6 +105,9 @@ pairs describing additional properties of this bundle. Both keys and values are URI encoded. +For pull bundles, the URL is a path under the ``.hg`` directory of the +repository. + Keys in UPPERCASE are reserved for use by Mercurial and are defined below. All non-uppercase keys can be used by site installations. 
An example use for custom properties is to use the *datacenter* attribute to define which @@ -133,6 +156,15 @@ Value should be "true". +heads + Used for pull bundles. This contains the ``;`` separated changeset + hashes of the heads of the bundle content. + +bases + Used for pull bundles. This contains the ``;`` separated changeset + hashes of the roots of the bundle content. This can be skipped if + the bundle was created without ``--base``. + Manifests can contain multiple entries. Assuming metadata is defined, clients will filter entries from the manifest that they don't support. The remaining entries are optionally sorted by client preferences @@ -166,7 +198,7 @@ from mercurial import ( extensions, - wireproto, + wireprotov1server, ) testedwith = 'ships-with-hg-core' @@ -183,4 +215,4 @@ return caps def extsetup(ui): - extensions.wrapfunction(wireproto, '_capabilities', capabilities) + extensions.wrapfunction(wireprotov1server, '_capabilities', capabilities) diff -r fb92df8b634c -r ed5448edcbfa hgext/commitextras.py --- a/hgext/commitextras.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/commitextras.py Wed Apr 18 15:32:08 2018 -0400 @@ -70,7 +70,7 @@ # This __dict__ logic is needed because the normal # extension.wrapfunction doesn't seem to work. 
- repo.__dict__['commit'] = _wrappedcommit + repo.__dict__[r'commit'] = _wrappedcommit return orig(ui, repo, *pats, **opts) finally: - del repo.__dict__['commit'] + del repo.__dict__[r'commit'] diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/__init__.py --- a/hgext/convert/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -477,11 +477,12 @@ dates.''' return cvsps.debugcvsps(ui, *args, **opts) -def kwconverted(ctx, name): +def kwconverted(context, mapping, name): + ctx = context.resource(mapping, 'ctx') rev = ctx.extra().get('convert_revision', '') if rev.startswith('svn:'): if name == 'svnrev': - return str(subversion.revsplit(rev)[2]) + return (b"%d" % subversion.revsplit(rev)[2]) elif name == 'svnpath': return subversion.revsplit(rev)[1] elif name == 'svnuuid': @@ -490,20 +491,20 @@ templatekeyword = registrar.templatekeyword() -@templatekeyword('svnrev') -def kwsvnrev(repo, ctx, **args): +@templatekeyword('svnrev', requires={'ctx'}) +def kwsvnrev(context, mapping): """String. Converted subversion revision number.""" - return kwconverted(ctx, 'svnrev') + return kwconverted(context, mapping, 'svnrev') -@templatekeyword('svnpath') -def kwsvnpath(repo, ctx, **args): +@templatekeyword('svnpath', requires={'ctx'}) +def kwsvnpath(context, mapping): """String. Converted subversion revision project path.""" - return kwconverted(ctx, 'svnpath') + return kwconverted(context, mapping, 'svnpath') -@templatekeyword('svnuuid') -def kwsvnuuid(repo, ctx, **args): +@templatekeyword('svnuuid', requires={'ctx'}) +def kwsvnuuid(context, mapping): """String. 
Converted subversion revision repository identifier.""" - return kwconverted(ctx, 'svnuuid') + return kwconverted(context, mapping, 'svnuuid') # tell hggettext to extract docstrings from these functions: i18nfunctions = [kwsvnrev, kwsvnpath, kwsvnuuid] diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/common.py --- a/hgext/convert/common.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/common.py Wed Apr 18 15:32:08 2018 -0400 @@ -11,6 +11,7 @@ import errno import os import re +import shlex import subprocess from mercurial.i18n import _ @@ -18,12 +19,68 @@ encoding, error, phases, + pycompat, util, ) +from mercurial.utils import ( + procutil, +) pickle = util.pickle propertycache = util.propertycache +def _encodeornone(d): + if d is None: + return + return d.encode('latin1') + +class _shlexpy3proxy(object): + + def __init__(self, l): + self._l = l + + def __iter__(self): + return (_encodeornone(v) for v in self._l) + + def get_token(self): + return _encodeornone(self._l.get_token()) + + @property + def infile(self): + return self._l.infile or '' + + @property + def lineno(self): + return self._l.lineno + +def shlexer(data=None, filepath=None, wordchars=None, whitespace=None): + if data is None: + if pycompat.ispy3: + data = open(filepath, 'r', encoding=r'latin1') + else: + data = open(filepath, 'r') + else: + if filepath is not None: + raise error.ProgrammingError( + 'shlexer only accepts data or filepath, not both') + if pycompat.ispy3: + data = data.decode('latin1') + l = shlex.shlex(data, infile=filepath, posix=True) + if whitespace is not None: + l.whitespace_split = True + if pycompat.ispy3: + l.whitespace += whitespace.decode('latin1') + else: + l.whitespace += whitespace + if wordchars is not None: + if pycompat.ispy3: + l.wordchars += wordchars.decode('latin1') + else: + l.wordchars += wordchars + if pycompat.ispy3: + return _shlexpy3proxy(l) + return l + def encodeargs(args): def encodearg(s): lines = base64.encodestring(s) @@ -42,7 +99,7 @@ def 
checktool(exe, name=None, abort=True): name = name or exe - if not util.findexe(exe): + if not procutil.findexe(exe): if abort: exc = error.Abort else: @@ -87,7 +144,7 @@ """ fails if revstr is not a 40 byte hex. mercurial and git both uses such format for their revision numbering """ - if not re.match(r'[0-9a-fA-F]{40,40}$', revstr): + if not re.match(br'[0-9a-fA-F]{40,40}$', revstr): raise error.Abort(_('%s entry %s is not a valid revision' ' identifier') % (mapname, revstr)) @@ -160,12 +217,13 @@ if isinstance(s, unicode): return s.encode("utf-8") try: - return s.decode(encoding).encode("utf-8") + return s.decode(pycompat.sysstr(encoding)).encode("utf-8") except UnicodeError: try: return s.decode("latin-1").encode("utf-8") except UnicodeError: - return s.decode(encoding, "replace").encode("utf-8") + return s.decode(pycompat.sysstr(encoding), + "replace").encode("utf-8") def getchangedfiles(self, rev, i): """Return the files changed by rev compared to parent[i]. @@ -322,6 +380,7 @@ pass def _cmdline(self, cmd, *args, **kwargs): + kwargs = pycompat.byteskwargs(kwargs) cmdline = [self.command, cmd] + list(args) for k, v in kwargs.iteritems(): if len(k) == 1: @@ -335,25 +394,25 @@ cmdline[-1] += '=' + v except TypeError: pass - cmdline = [util.shellquote(arg) for arg in cmdline] + cmdline = [procutil.shellquote(arg) for arg in cmdline] if not self.ui.debugflag: - cmdline += ['2>', os.devnull] + cmdline += ['2>', pycompat.bytestr(os.devnull)] cmdline = ' '.join(cmdline) return cmdline def _run(self, cmd, *args, **kwargs): def popen(cmdline): p = subprocess.Popen(cmdline, shell=True, bufsize=-1, - close_fds=util.closefds, - stdout=subprocess.PIPE) + close_fds=procutil.closefds, + stdout=subprocess.PIPE) return p return self._dorun(popen, cmd, *args, **kwargs) def _run2(self, cmd, *args, **kwargs): - return self._dorun(util.popen2, cmd, *args, **kwargs) + return self._dorun(procutil.popen2, cmd, *args, **kwargs) def _run3(self, cmd, *args, **kwargs): - return 
self._dorun(util.popen3, cmd, *args, **kwargs) + return self._dorun(procutil.popen3, cmd, *args, **kwargs) def _dorun(self, openfunc, cmd, *args, **kwargs): cmdline = self._cmdline(cmd, *args, **kwargs) @@ -382,7 +441,7 @@ if output: self.ui.warn(_('%s error:\n') % self.command) self.ui.warn(output) - msg = util.explainexit(status)[0] + msg = procutil.explainexit(status) raise error.Abort('%s %s' % (self.command, msg)) def run0(self, cmd, *args, **kwargs): @@ -416,17 +475,17 @@ def _limit_arglist(self, arglist, cmd, *args, **kwargs): cmdlen = len(self._cmdline(cmd, *args, **kwargs)) limit = self.argmax - cmdlen - bytes = 0 + numbytes = 0 fl = [] for fn in arglist: b = len(fn) + 3 - if bytes + b < limit or len(fl) == 0: + if numbytes + b < limit or len(fl) == 0: fl.append(fn) - bytes += b + numbytes += b else: yield fl fl = [fn] - bytes = b + numbytes = b if fl: yield fl @@ -447,7 +506,7 @@ if not self.path: return try: - fp = open(self.path, 'r') + fp = open(self.path, 'rb') except IOError as err: if err.errno != errno.ENOENT: raise @@ -471,12 +530,12 @@ def __setitem__(self, key, value): if self.fp is None: try: - self.fp = open(self.path, 'a') + self.fp = open(self.path, 'ab') except IOError as err: raise error.Abort( _('could not open map file %r: %s') % (self.path, encoding.strtolocal(err.strerror))) - self.fp.write('%s %s\n' % (key, value)) + self.fp.write(util.tonativeeol('%s %s\n' % (key, value))) self.fp.flush() super(mapfile, self).__setitem__(key, value) @@ -486,7 +545,7 @@ self.fp = None def makedatetimestamp(t): - """Like util.makedate() but for time t instead of current time""" + """Like dateutil.makedate() but for time t instead of current time""" delta = (datetime.datetime.utcfromtimestamp(t) - datetime.datetime.fromtimestamp(t)) tz = delta.days * 86400 + delta.seconds diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/convcmd.py --- a/hgext/convert/convcmd.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/convcmd.py Wed Apr 18 15:32:08 2018 
-0400 @@ -8,7 +8,6 @@ import collections import os -import shlex import shutil from mercurial.i18n import _ @@ -16,9 +15,11 @@ encoding, error, hg, + pycompat, scmutil, util, ) +from mercurial.utils import dateutil from . import ( bzr, @@ -55,9 +56,10 @@ def recode(s): if isinstance(s, unicode): - return s.encode(orig_encoding, 'replace') + return s.encode(pycompat.sysstr(orig_encoding), 'replace') else: - return s.decode('utf-8').encode(orig_encoding, 'replace') + return s.decode('utf-8').encode( + pycompat.sysstr(orig_encoding), 'replace') def mapbranch(branch, branchmap): ''' @@ -202,16 +204,14 @@ return {} m = {} try: - fp = open(path, 'r') + fp = open(path, 'rb') for i, line in enumerate(util.iterfile(fp)): line = line.splitlines()[0].rstrip() if not line: # Ignore blank lines continue # split line - lex = shlex.shlex(line, posix=True) - lex.whitespace_split = True - lex.whitespace += ',' + lex = common.shlexer(data=line, whitespace=',') line = list(lex) # check number of parents if not (2 <= len(line) <= 3): @@ -356,7 +356,7 @@ dates = {} def getdate(n): if n not in dates: - dates[n] = util.parsedate(self.commitcache[n].date) + dates[n] = dateutil.parsedate(self.commitcache[n].date) return dates[n] def picknext(nodes): @@ -407,13 +407,14 @@ authorfile = self.authorfile if authorfile: self.ui.status(_('writing author map file %s\n') % authorfile) - ofile = open(authorfile, 'w+') + ofile = open(authorfile, 'wb+') for author in self.authors: - ofile.write("%s=%s\n" % (author, self.authors[author])) + ofile.write(util.tonativeeol("%s=%s\n" + % (author, self.authors[author]))) ofile.close() def readauthormap(self, authorfile): - afile = open(authorfile, 'r') + afile = open(authorfile, 'rb') for line in afile: line = line.strip() @@ -564,6 +565,7 @@ self.map.close() def convert(ui, src, dest=None, revmapfile=None, **opts): + opts = pycompat.byteskwargs(opts) global orig_encoding orig_encoding = encoding.encoding encoding.encoding = 'UTF-8' diff -r fb92df8b634c -r 
ed5448edcbfa hgext/convert/cvs.py --- a/hgext/convert/cvs.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/cvs.py Wed Apr 18 15:32:08 2018 -0400 @@ -18,6 +18,10 @@ pycompat, util, ) +from mercurial.utils import ( + dateutil, + procutil, +) from . import ( common, @@ -46,8 +50,8 @@ self.tags = {} self.lastbranch = {} self.socket = None - self.cvsroot = open(os.path.join(cvs, "Root")).read()[:-1] - self.cvsrepo = open(os.path.join(cvs, "Repository")).read()[:-1] + self.cvsroot = open(os.path.join(cvs, "Root"), 'rb').read()[:-1] + self.cvsrepo = open(os.path.join(cvs, "Repository"), 'rb').read()[:-1] self.encoding = encoding.encoding self._connect() @@ -87,24 +91,24 @@ for cs in db: if maxrev and cs.id > maxrev: break - id = str(cs.id) + id = (b"%d" % cs.id) cs.author = self.recode(cs.author) self.lastbranch[cs.branch] = id cs.comment = self.recode(cs.comment) if self.ui.configbool('convert', 'localtimezone'): cs.date = makedatetimestamp(cs.date[0]) - date = util.datestr(cs.date, '%Y-%m-%d %H:%M:%S %1%2') + date = dateutil.datestr(cs.date, '%Y-%m-%d %H:%M:%S %1%2') self.tags.update(dict.fromkeys(cs.tags, id)) files = {} for f in cs.entries: - files[f.file] = "%s%s" % ('.'.join([str(x) + files[f.file] = "%s%s" % ('.'.join([(b"%d" % x) for x in f.revision]), ['', '(DEAD)'][f.dead]) # add current commit to set c = commit(author=cs.author, date=date, - parents=[str(p.id) for p in cs.parents], + parents=[(b"%d" % p.id) for p in cs.parents], desc=cs.comment, branch=cs.branch or '') self.changeset[id] = c self.files[id] = files @@ -141,7 +145,7 @@ passw = "A" cvspass = os.path.expanduser("~/.cvspass") try: - pf = open(cvspass) + pf = open(cvspass, 'rb') for line in pf.read().splitlines(): part1, part2 = line.split(' ', 1) # /1 :pserver:user@example.com:2401/cvsroot/foo @@ -179,7 +183,7 @@ # :ext:user@host/home/user/path/to/cvsroot if root.startswith(":ext:"): root = root[5:] - m = re.match(r'(?:([^@:/]+)@)?([^:/]+):?(.*)', root) + m = 
re.match(br'(?:([^@:/]+)@)?([^:/]+):?(.*)', root) # Do not take Windows path "c:\foo\bar" for a connection strings if os.path.isdir(root) or not m: conntype = "local" @@ -196,9 +200,9 @@ cmd = [rsh, host] + cmd # popen2 does not support argument lists under Windows - cmd = [util.shellquote(arg) for arg in cmd] - cmd = util.quotecommand(' '.join(cmd)) - self.writep, self.readp = util.popen2(cmd) + cmd = [procutil.shellquote(arg) for arg in cmd] + cmd = procutil.quotecommand(' '.join(cmd)) + self.writep, self.readp = procutil.popen2(cmd) self.realroot = root diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/cvsps.py --- a/hgext/convert/cvsps.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/cvsps.py Wed Apr 18 15:32:08 2018 -0400 @@ -17,6 +17,11 @@ pycompat, util, ) +from mercurial.utils import ( + dateutil, + procutil, + stringutil, +) pickle = util.pickle @@ -132,7 +137,7 @@ # Get the real directory in the repository try: - prefix = open(os.path.join('CVS','Repository')).read().strip() + prefix = open(os.path.join('CVS','Repository'), 'rb').read().strip() directory = prefix if prefix == ".": prefix = "" @@ -144,7 +149,7 @@ # Use the Root file in the sandbox, if it exists try: - root = open(os.path.join('CVS','Root')).read().strip() + root = open(os.path.join('CVS','Root'), 'rb').read().strip() except IOError: pass @@ -170,14 +175,14 @@ # /pserver/user/server/path # are mapped to different cache file names. 
cachefile = root.split(":") + [directory, "cache"] - cachefile = ['-'.join(re.findall(r'\w+', s)) for s in cachefile if s] + cachefile = ['-'.join(re.findall(br'\w+', s)) for s in cachefile if s] cachefile = os.path.join(cachedir, '.'.join([s for s in cachefile if s])) if cache == 'update': try: ui.note(_('reading cvs log cache %s\n') % cachefile) - oldlog = pickle.load(open(cachefile)) + oldlog = pickle.load(open(cachefile, 'rb')) for e in oldlog: if not (util.safehasattr(e, 'branchpoints') and util.safehasattr(e, 'commitid') and @@ -192,7 +197,7 @@ if oldlog: date = oldlog[-1].date # last commit date as a (time,tz) tuple - date = util.datestr(date, '%Y/%m/%d %H:%M:%S %1%2') + date = dateutil.datestr(date, '%Y/%m/%d %H:%M:%S %1%2') # build the CVS commandline cmd = ['cvs', '-q'] @@ -219,17 +224,17 @@ state = 0 store = False # set when a new record can be appended - cmd = [util.shellquote(arg) for arg in cmd] + cmd = [procutil.shellquote(arg) for arg in cmd] ui.note(_("running %s\n") % (' '.join(cmd))) ui.debug("prefix=%r directory=%r root=%r\n" % (prefix, directory, root)) - pfp = util.popen(' '.join(cmd)) - peek = pfp.readline() + pfp = procutil.popen(' '.join(cmd), 'rb') + peek = util.fromnativeeol(pfp.readline()) while True: line = peek if line == '': break - peek = pfp.readline() + peek = util.fromnativeeol(pfp.readline()) if line.endswith('\n'): line = line[:-1] #ui.debug('state=%d line=%r\n' % (state, line)) @@ -336,7 +341,7 @@ if len(d.split()) != 3: # cvs log dates always in GMT d = d + ' UTC' - e.date = util.parsedate(d, ['%y/%m/%d %H:%M:%S', + e.date = dateutil.parsedate(d, ['%y/%m/%d %H:%M:%S', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S']) e.author = scache(match.group(2)) @@ -451,7 +456,8 @@ rcsmap[e.rcs.replace('/Attic/', '/')] = e.rcs if len(log) % 100 == 0: - ui.status(util.ellipsis('%d %s' % (len(log), e.file), 80)+'\n') + ui.status(stringutil.ellipsis('%d %s' % (len(log), e.file), 80) + + '\n') log.sort(key=lambda x: (x.rcs, x.revision)) @@ -486,7 
+492,7 @@ # write the new cachefile ui.note(_('writing cvs log cache %s\n') % cachefile) - pickle.dump(log, open(cachefile, 'w')) + pickle.dump(log, open(cachefile, 'wb')) else: log = oldlog @@ -607,7 +613,7 @@ files = set() if len(changesets) % 100 == 0: t = '%d %s' % (len(changesets), repr(e.comment)[1:-1]) - ui.status(util.ellipsis(t, 80) + '\n') + ui.status(stringutil.ellipsis(t, 80) + '\n') c.entries.append(e) files.add(e.file) @@ -855,6 +861,7 @@ repository, and convert the log to changesets based on matching commit log entries and dates. ''' + opts = pycompat.byteskwargs(opts) if opts["new_cache"]: cache = "write" elif opts["update_cache"]: @@ -900,7 +907,7 @@ # bug-for-bug compatibility with cvsps. ui.write('---------------------\n') ui.write(('PatchSet %d \n' % cs.id)) - ui.write(('Date: %s\n' % util.datestr(cs.date, + ui.write(('Date: %s\n' % dateutil.datestr(cs.date, '%Y/%m/%d %H:%M:%S %1%2'))) ui.write(('Author: %s\n' % cs.author)) ui.write(('Branch: %s\n' % (cs.branch or 'HEAD'))) @@ -912,7 +919,7 @@ if opts["parents"] and cs.parents: if len(cs.parents) > 1: ui.write(('Parents: %s\n' % - (','.join([str(p.id) for p in cs.parents])))) + (','.join([(b"%d" % p.id) for p in cs.parents])))) else: ui.write(('Parent: %d\n' % cs.parents[0].id)) @@ -934,18 +941,18 @@ fn = fn[len(opts["prefix"]):] ui.write('\t%s:%s->%s%s \n' % ( fn, '.'.join([str(x) for x in f.parent]) or 'INITIAL', - '.'.join([str(x) for x in f.revision]), + '.'.join([(b"%d" % x) for x in f.revision]), ['', '(DEAD)'][f.dead])) ui.write('\n') # have we seen the start tag? 
if revisions and off: - if revisions[0] == str(cs.id) or \ + if revisions[0] == (b"%d" % cs.id) or \ revisions[0] in cs.tags: off = False # see if we reached the end tag if len(revisions) > 1 and not off: - if revisions[1] == str(cs.id) or \ + if revisions[1] == (b"%d" % cs.id) or \ revisions[1] in cs.tags: break diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/darcs.py --- a/hgext/convert/darcs.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/darcs.py Wed Apr 18 15:32:08 2018 -0400 @@ -16,6 +16,7 @@ error, util, ) +from mercurial.utils import dateutil from . import common NoRepo = common.NoRepo @@ -148,12 +149,14 @@ def getcommit(self, rev): elt = self.changes[rev] - date = util.strdate(elt.get('local_date'), '%a %b %d %H:%M:%S %Z %Y') + dateformat = '%a %b %d %H:%M:%S %Z %Y' + date = dateutil.strdate(elt.get('local_date'), dateformat) desc = elt.findtext('name') + '\n' + elt.findtext('comment', '') # etree can return unicode objects for name, comment, and author, # so recode() is used to ensure str objects are emitted. + newdateformat = '%Y-%m-%d %H:%M:%S %1%2' return common.commit(author=self.recode(elt.get('author')), - date=util.datestr(date, '%Y-%m-%d %H:%M:%S %1%2'), + date=dateutil.datestr(date, newdateformat), desc=self.recode(desc).strip(), parents=self.parents[rev]) diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/filemap.py --- a/hgext/convert/filemap.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/filemap.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,11 +7,11 @@ from __future__ import absolute_import, print_function import posixpath -import shlex from mercurial.i18n import _ from mercurial import ( error, + pycompat, ) from . 
import common SKIPREV = common.SKIPREV @@ -68,11 +68,12 @@ name.endswith('/') or '//' in name): self.ui.warn(_('%s:%d: superfluous / in %s %r\n') % - (lex.infile, lex.lineno, listname, name)) + (lex.infile, lex.lineno, listname, + pycompat.bytestr(name))) return 1 return 0 - lex = shlex.shlex(open(path), path, True) - lex.wordchars += '!@#$%^&*()-=+[]{}|;:,./<>?' + lex = common.shlexer( + filepath=path, wordchars='!@#$%^&*()-=+[]{}|;:,./<>?') cmd = lex.get_token() while cmd: if cmd == 'include': @@ -93,7 +94,7 @@ errs += self.parse(normalize(lex.get_token())) else: self.ui.warn(_('%s:%d: unknown directive %r\n') % - (lex.infile, lex.lineno, cmd)) + (lex.infile, lex.lineno, pycompat.bytestr(cmd))) errs += 1 cmd = lex.get_token() return errs diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/git.py --- a/hgext/convert/git.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/git.py Wed Apr 18 15:32:08 2018 -0400 @@ -168,19 +168,19 @@ raise error.Abort(_('cannot retrieve git head "%s"') % rev) return heads - def catfile(self, rev, type): + def catfile(self, rev, ftype): if rev == nodemod.nullhex: raise IOError self.catfilepipe[0].write(rev+'\n') self.catfilepipe[0].flush() info = self.catfilepipe[1].readline().split() - if info[1] != type: - raise error.Abort(_('cannot read %r object at %s') % (type, rev)) + if info[1] != ftype: + raise error.Abort(_('cannot read %r object at %s') % (ftype, rev)) size = int(info[2]) data = self.catfilepipe[1].read(size) if len(data) < size: raise error.Abort(_('cannot read %r object at %s: unexpected size') - % (type, rev)) + % (ftype, rev)) # read the trailing newline self.catfilepipe[1].read(1) return data @@ -372,7 +372,7 @@ tzs, tzh, tzm = tz[-5:-4] + "1", tz[-4:-2], tz[-2:] tz = -int(tzs) * (int(tzh) * 3600 + int(tzm)) - date = tm + " " + str(tz) + date = tm + " " + (b"%d" % tz) saverev = self.ui.configbool('convert', 'git.saverev') c = common.commit(parents=parents, date=date, author=author, @@ -435,7 +435,7 @@ else: output, 
status = self.gitrunlines('diff-tree', '--name-only', '--root', '-r', version, - '%s^%s' % (version, i + 1), '--') + '%s^%d' % (version, i + 1), '--') if status: raise error.Abort(_('cannot read changes in %s') % version) changes = [f.rstrip('\n') for f in output] diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/gnuarch.py --- a/hgext/convert/gnuarch.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/gnuarch.py Wed Apr 18 15:32:08 2018 -0400 @@ -17,7 +17,10 @@ from mercurial import ( encoding, error, - util, +) +from mercurial.utils import ( + dateutil, + procutil, ) from . import common @@ -45,10 +48,10 @@ # Could use checktool, but we want to check for baz or tla. self.execmd = None - if util.findexe('baz'): + if procutil.findexe('baz'): self.execmd = 'baz' else: - if util.findexe('tla'): + if procutil.findexe('tla'): self.execmd = 'tla' else: raise error.Abort(_('cannot find a GNU Arch tool')) @@ -194,9 +197,9 @@ def _execute(self, cmd, *args, **kwargs): cmdline = [self.execmd, cmd] cmdline += args - cmdline = [util.shellquote(arg) for arg in cmdline] + cmdline = [procutil.shellquote(arg) for arg in cmdline] cmdline += ['>', os.devnull, '2>', os.devnull] - cmdline = util.quotecommand(' '.join(cmdline)) + cmdline = procutil.quotecommand(' '.join(cmdline)) self.ui.debug(cmdline, '\n') return os.system(cmdline) @@ -280,8 +283,8 @@ catlog = self.catlogparser.parsestr(data) # Commit date - self.changes[rev].date = util.datestr( - util.strdate(catlog['Standard-date'], + self.changes[rev].date = dateutil.datestr( + dateutil.strdate(catlog['Standard-date'], '%Y-%m-%d %H:%M:%S')) # Commit author diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/hg.py --- a/hgext/convert/hg.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/hg.py Wed Apr 18 15:32:08 2018 -0400 @@ -36,13 +36,14 @@ scmutil, util, ) +from mercurial.utils import dateutil stringio = util.stringio from . 
import common mapfile = common.mapfile NoRepo = common.NoRepo -sha1re = re.compile(r'\b[0-9a-f]{12,40}\b') +sha1re = re.compile(br'\b[0-9a-f]{12,40}\b') class mercurial_sink(common.converter_sink): def __init__(self, ui, repotype, path): @@ -362,12 +363,8 @@ return p2 def puttags(self, tags): - try: - parentctx = self.repo[self.tagsbranch] - tagparent = parentctx.node() - except error.RepoError: - parentctx = None - tagparent = nodemod.nullid + tagparent = self.repo.branchtip(self.tagsbranch, ignoremissing=True) + tagparent = tagparent or nodemod.nullid oldlines = set() for branch, heads in self.repo.branchmap().iteritems(): @@ -404,7 +401,7 @@ return context.memfilectx(repo, memctx, f, data, False, False, None) self.ui.status(_("updating tags\n")) - date = "%s 0" % int(time.mktime(time.gmtime())) + date = "%d 0" % int(time.mktime(time.gmtime())) extra = {'branch': self.tagsbranch} ctx = context.memctx(self.repo, (tagparent, None), "update tags", [".hgtags"], getfilectx, "convert-repo", date, @@ -480,7 +477,7 @@ else: self.keep = util.always if revs: - self._heads = [self.repo[r].node() for r in revs] + self._heads = [self.repo.lookup(r) for r in revs] else: self._heads = self.repo.heads() else: @@ -563,12 +560,7 @@ if copysource in self.ignored: continue # Ignore copy sources not in parent revisions - found = False - for p in parents: - if copysource in p: - found = True - break - if not found: + if not any(copysource in p for p in parents): continue copies[name] = copysource except TypeError: @@ -588,7 +580,7 @@ crev = rev return common.commit(author=ctx.user(), - date=util.datestr(ctx.date(), + date=dateutil.datestr(ctx.date(), '%Y-%m-%d %H:%M:%S %1%2'), desc=ctx.description(), rev=crev, @@ -625,8 +617,8 @@ def converted(self, rev, destrev): if self.convertfp is None: - self.convertfp = open(self.repo.vfs.join('shamap'), 'a') - self.convertfp.write('%s %s\n' % (destrev, rev)) + self.convertfp = open(self.repo.vfs.join('shamap'), 'ab') + 
self.convertfp.write(util.tonativeeol('%s %s\n' % (destrev, rev))) self.convertfp.flush() def before(self): diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/monotone.py --- a/hgext/convert/monotone.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/monotone.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,8 +13,9 @@ from mercurial.i18n import _ from mercurial import ( error, - util, + pycompat, ) +from mercurial.utils import dateutil from . import common @@ -36,7 +37,7 @@ if not os.path.exists(os.path.join(path, '_MTN')): # Could be a monotone repository (SQLite db file) try: - f = file(path, 'rb') + f = open(path, 'rb') header = f.read(16) f.close() except IOError: @@ -45,11 +46,11 @@ raise norepo # regular expressions for parsing monotone output - space = r'\s*' - name = r'\s+"((?:\\"|[^"])*)"\s*' + space = br'\s*' + name = br'\s+"((?:\\"|[^"])*)"\s*' value = name - revision = r'\s+\[(\w+)\]\s*' - lines = r'(?:.|\n)+' + revision = br'\s+\[(\w+)\]\s*' + lines = br'(?:.|\n)+' self.dir_re = re.compile(space + "dir" + name) self.file_re = re.compile(space + "file" + name + @@ -84,11 +85,12 @@ return self.mtnrunsingle(*args, **kwargs) def mtnrunsingle(self, *args, **kwargs): - kwargs['d'] = self.path + kwargs[r'd'] = self.path return self.run0('automate', *args, **kwargs) def mtnrunstdio(self, *args, **kwargs): # Prepare the command in automate stdio format + kwargs = pycompat.byteskwargs(kwargs) command = [] for k, v in kwargs.iteritems(): command.append("%s:%s" % (len(k), k)) @@ -308,9 +310,10 @@ certs = self.mtngetcerts(rev) if certs.get('suspend') == certs["branch"]: extra['close'] = 1 + dateformat = "%Y-%m-%dT%H:%M:%S" return common.commit( author=certs["author"], - date=util.datestr(util.strdate(certs["date"], "%Y-%m-%dT%H:%M:%S")), + date=dateutil.datestr(dateutil.strdate(certs["date"], dateformat)), desc=certs["changelog"], rev=rev, parents=self.mtnrun("parents", rev).splitlines(), diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/p4.py --- a/hgext/convert/p4.py 
Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/p4.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,6 +14,11 @@ error, util, ) +from mercurial.utils import ( + dateutil, + procutil, + stringutil, +) from . import common @@ -85,8 +90,8 @@ def _parse_view(self, path): "Read changes affecting the path" - cmd = 'p4 -G changes -s submitted %s' % util.shellquote(path) - stdout = util.popen(cmd, mode='rb') + cmd = 'p4 -G changes -s submitted %s' % procutil.shellquote(path) + stdout = procutil.popen(cmd, mode='rb') p4changes = {} for d in loaditer(stdout): c = d.get("change", None) @@ -114,8 +119,8 @@ else: views = {"//": ""} else: - cmd = 'p4 -G client -o %s' % util.shellquote(path) - clientspec = marshal.load(util.popen(cmd, mode='rb')) + cmd = 'p4 -G client -o %s' % procutil.shellquote(path) + clientspec = marshal.load(procutil.popen(cmd, mode='rb')) views = {} for client in clientspec: @@ -168,7 +173,7 @@ shortdesc = '**empty changelist description**' t = '%s %s' % (c.rev, repr(shortdesc)[1:-1]) - ui.status(util.ellipsis(t, 80) + '\n') + ui.status(stringutil.ellipsis(t, 80) + '\n') files = [] copies = {} @@ -194,8 +199,8 @@ oldname = depotname[filename] flcmd = 'p4 -G filelog %s' \ - % util.shellquote(oldname) - flstdout = util.popen(flcmd, mode='rb') + % procutil.shellquote(oldname) + flstdout = procutil.popen(flcmd, mode='rb') copiedfilename = None for d in loaditer(flstdout): @@ -268,11 +273,11 @@ def getfile(self, name, rev): cmd = 'p4 -G print %s' \ - % util.shellquote("%s#%s" % (self.depotname[name], rev)) + % procutil.shellquote("%s#%s" % (self.depotname[name], rev)) lasterror = None while True: - stdout = util.popen(cmd, mode='rb') + stdout = procutil.popen(cmd, mode='rb') mode = None contents = [] @@ -346,7 +351,7 @@ parents = [] return common.commit(author=self.recode(obj["user"]), - date=util.datestr(date, '%Y-%m-%d %H:%M:%S %1%2'), + date=dateutil.datestr(date, '%Y-%m-%d %H:%M:%S %1%2'), parents=parents, desc=desc, branch=None, rev=obj['change'], extra={"p4": 
obj['change'], "convert_revision": obj['change']}) @@ -354,7 +359,7 @@ """Return an output of `p4 describe` including author, commit date as a dictionary.""" cmd = "p4 -G describe -s %s" % rev - stdout = util.popen(cmd, mode='rb') + stdout = procutil.popen(cmd, mode='rb') return marshal.load(stdout) def getcommit(self, rev): diff -r fb92df8b634c -r ed5448edcbfa hgext/convert/subversion.py --- a/hgext/convert/subversion.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/convert/subversion.py Wed Apr 18 15:32:08 2018 -0400 @@ -16,6 +16,11 @@ util, vfs as vfsmod, ) +from mercurial.utils import ( + dateutil, + procutil, + stringutil, +) from . import common @@ -146,10 +151,10 @@ # Caller may interrupt the iteration pickle.dump(None, fp, protocol) except Exception as inst: - pickle.dump(str(inst), fp, protocol) + pickle.dump(stringutil.forcebytestr(inst), fp, protocol) else: pickle.dump(None, fp, protocol) - fp.close() + fp.flush() # With large history, cleanup process goes crazy and suddenly # consumes *huge* amount of memory. The output file being closed, # there is no need for clean termination. 
@@ -231,7 +236,7 @@ def httpcheck(ui, path, proto): try: opener = urlreq.buildopener() - rsp = opener.open('%s://%s/!svn/ver/0/.svn' % (proto, path)) + rsp = opener.open('%s://%s/!svn/ver/0/.svn' % (proto, path), 'rb') data = rsp.read() except urlerr.httperror as inst: if inst.code != 404: @@ -384,7 +389,7 @@ def setrevmap(self, revmap): lastrevs = {} - for revid in revmap.iterkeys(): + for revid in revmap: uuid, module, revnum = revsplit(revid) lastrevnum = lastrevs.setdefault(module, revnum) if revnum > lastrevnum: @@ -639,8 +644,9 @@ return if self.convertfp is None: self.convertfp = open(os.path.join(self.wc, '.svn', 'hg-shamap'), - 'a') - self.convertfp.write('%s %d\n' % (destrev, self.revnum(rev))) + 'ab') + self.convertfp.write(util.tonativeeol('%s %d\n' + % (destrev, self.revnum(rev)))) self.convertfp.flush() def revid(self, revnum, module=None): @@ -890,7 +896,7 @@ # Example SVN datetime. Includes microseconds. # ISO-8601 conformant # '2007-01-04T17:35:00.902377Z' - date = util.parsedate(date[:19] + " UTC", ["%Y-%m-%dT%H:%M:%S"]) + date = dateutil.parsedate(date[:19] + " UTC", ["%Y-%m-%dT%H:%M:%S"]) if self.ui.configbool('convert', 'localtimezone'): date = makedatetimestamp(date[0]) @@ -912,7 +918,7 @@ branch = None cset = commit(author=author, - date=util.datestr(date, '%Y-%m-%d %H:%M:%S %1%2'), + date=dateutil.datestr(date, '%Y-%m-%d %H:%M:%S %1%2'), desc=log, parents=parents, branch=branch, @@ -1064,9 +1070,9 @@ if not self.ui.configbool('convert', 'svn.debugsvnlog'): return directlogstream(*args) arg = encodeargs(args) - hgexe = util.hgexecutable() - cmd = '%s debugsvnlog' % util.shellquote(hgexe) - stdin, stdout = util.popen2(util.quotecommand(cmd)) + hgexe = procutil.hgexecutable() + cmd = '%s debugsvnlog' % procutil.shellquote(hgexe) + stdin, stdout = procutil.popen2(procutil.quotecommand(cmd)) stdin.write(arg) try: stdin.close() @@ -1128,7 +1134,7 @@ self.wc = os.path.realpath(path) self.run0('update') else: - if not 
re.search(r'^(file|http|https|svn|svn\+ssh)\://', path): + if not re.search(br'^(file|http|https|svn|svn\+ssh)\://', path): path = os.path.realpath(path) if os.path.isdir(os.path.dirname(path)): if not os.path.exists(os.path.join(path, 'db', 'fs-type')): @@ -1158,7 +1164,7 @@ if created: hook = os.path.join(created, 'hooks', 'pre-revprop-change') - fp = open(hook, 'w') + fp = open(hook, 'wb') fp.write(pre_revprop_change) fp.close() util.setflags(hook, False, True) @@ -1308,12 +1314,12 @@ self.setexec = [] fd, messagefile = tempfile.mkstemp(prefix='hg-convert-') - fp = os.fdopen(fd, pycompat.sysstr('w')) - fp.write(commit.desc) + fp = os.fdopen(fd, r'wb') + fp.write(util.tonativeeol(commit.desc)) fp.close() try: output = self.run0('commit', - username=util.shortuser(commit.author), + username=stringutil.shortuser(commit.author), file=messagefile, encoding='utf-8') try: diff -r fb92df8b634c -r ed5448edcbfa hgext/eol.py --- a/hgext/eol.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/eol.py Wed Apr 18 15:32:08 2018 -0400 @@ -103,8 +103,12 @@ match, pycompat, registrar, + scmutil, util, ) +from mercurial.utils import ( + stringutil, +) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for # extensions which SHIP WITH MERCURIAL. 
Non-mainline extensions should @@ -133,7 +137,7 @@ def tolf(s, params, ui, **kwargs): """Filter to convert to LF EOLs.""" - if util.binary(s): + if stringutil.binary(s): return s if ui.configbool('eol', 'only-consistent') and inconsistenteol(s): return s @@ -144,7 +148,7 @@ def tocrlf(s, params, ui, **kwargs): """Filter to convert to CRLF EOLs.""" - if util.binary(s): + if stringutil.binary(s): return s if ui.configbool('eol', 'only-consistent') and inconsistenteol(s): return s @@ -222,7 +226,7 @@ data = ctx[f].data() if (target == "to-lf" and "\r\n" in data or target == "to-crlf" and singlelf.search(data)): - failed.append((f, target, str(ctx))) + failed.append((f, target, bytes(ctx))) break return failed @@ -296,7 +300,8 @@ hook = checkheadshook def preupdate(ui, repo, hooktype, parent1, parent2): - repo.loadeol([parent1]) + p1node = scmutil.resolvehexnodeidprefix(repo, parent1) + repo.loadeol([p1node]) return False def uisetup(ui): @@ -403,7 +408,7 @@ if fctx is None: continue data = fctx.data() - if util.binary(data): + if stringutil.binary(data): # We should not abort here, since the user should # be able to say "** = native" to automatically # have all non-binary files taken care of. diff -r fb92df8b634c -r ed5448edcbfa hgext/extdiff.py --- a/hgext/extdiff.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/extdiff.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,6 +13,11 @@ non-option arguments: paths to directories containing snapshots of files to compare. +If there is more than one file being compared and the "child" revision +is the working directory, any modifications made in the external diff +program will be copied back to the working directory from the temporary +directory. + The extdiff extension also allows you to configure new diff commands, so you do not need to type :hg:`extdiff -p kdiff3` always. 
:: @@ -65,6 +70,7 @@ import os import re import shutil +import stat import tempfile from mercurial.i18n import _ from mercurial.node import ( @@ -76,11 +82,16 @@ cmdutil, error, filemerge, + formatter, pycompat, registrar, scmutil, util, ) +from mercurial.utils import ( + procutil, + stringutil, +) cmdtable = {} command = registrar.command(cmdtable) @@ -88,12 +99,12 @@ configtable = {} configitem = registrar.configitem(configtable) -configitem('extdiff', r'opts\..*', +configitem('extdiff', br'opts\..*', default='', generic=True, ) -configitem('diff-tools', r'.*\.diffargs$', +configitem('diff-tools', br'.*\.diffargs$', default=None, generic=True, ) @@ -158,14 +169,18 @@ msg = _('cannot specify --rev and --change at the same time') raise error.Abort(msg) elif change: - node2 = scmutil.revsingle(repo, change, None).node() - node1a, node1b = repo.changelog.parents(node2) + ctx2 = scmutil.revsingle(repo, change, None) + ctx1a, ctx1b = ctx2.p1(), ctx2.p2() else: - node1a, node2 = scmutil.revpair(repo, revs) + ctx1a, ctx2 = scmutil.revpair(repo, revs) if not revs: - node1b = repo.dirstate.p2() + ctx1b = repo[None].p2() else: - node1b = nullid + ctx1b = repo[nullid] + + node1a = ctx1a.node() + node1b = ctx1b.node() + node2 = ctx2.node() # Disable 3-way merge if there is only one parent if do3way: @@ -253,11 +268,13 @@ label2 = common_file + rev2 else: template = 'hg-%h.patch' - cmdutil.export(repo, [repo[node1a].rev(), repo[node2].rev()], - fntemplate=repo.vfs.reljoin(tmproot, template), - match=matcher) - label1a = cmdutil.makefilename(repo, template, node1a) - label2 = cmdutil.makefilename(repo, template, node2) + with formatter.nullformatter(ui, 'extdiff', {}) as fm: + cmdutil.export(repo, [repo[node1a].rev(), repo[node2].rev()], + fm, + fntemplate=repo.vfs.reljoin(tmproot, template), + match=matcher) + label1a = cmdutil.makefilename(repo[node1a], template) + label2 = cmdutil.makefilename(repo[node2], template) dir1a = repo.vfs.reljoin(tmproot, label1a) dir2 = 
repo.vfs.reljoin(tmproot, label2) dir1b = None @@ -276,16 +293,16 @@ key = match.group(3) if not do3way and key == 'parent2': return pre - return pre + util.shellquote(replace[key]) + return pre + procutil.shellquote(replace[key]) # Match parent2 first, so 'parent1?' will match both parent1 and parent - regex = (r'''(['"]?)([^\s'"$]*)''' - r'\$(parent2|parent1?|child|plabel1|plabel2|clabel|root)\1') + regex = (br'''(['"]?)([^\s'"$]*)''' + br'\$(parent2|parent1?|child|plabel1|plabel2|clabel|root)\1') if not do3way and not re.search(regex, cmdline): cmdline += ' $parent1 $child' cmdline = re.sub(regex, quote, cmdline) - ui.debug('running %r in %s\n' % (cmdline, tmproot)) + ui.debug('running %r in %s\n' % (pycompat.bytestr(cmdline), tmproot)) ui.system(cmdline, cwd=tmproot, blockedtag='extdiff') for copy_fn, working_fn, st in fnsandstat: @@ -297,7 +314,8 @@ # copyfile() carries over the permission, so the mode check could # be in an 'elif' branch, but for the case where the file has # changed without affecting mtime or size. - if (cpstat.st_mtime != st.st_mtime or cpstat.st_size != st.st_size + if (cpstat[stat.ST_MTIME] != st[stat.ST_MTIME] + or cpstat.st_size != st.st_size or (cpstat.st_mode & 0o100) != (st.st_mode & 0o100)): ui.debug('file changed while diffing. 
' 'Overwriting: %s (src: %s)\n' % (working_fn, copy_fn)) @@ -344,7 +362,7 @@ if not program: program = 'diff' option = option or ['-Npru'] - cmdline = ' '.join(map(util.shellquote, [program] + option)) + cmdline = ' '.join(map(procutil.shellquote, [program] + option)) return dodiff(ui, repo, cmdline, pats, opts) class savedcmd(object): @@ -365,13 +383,13 @@ def __init__(self, path, cmdline): # We can't pass non-ASCII through docstrings (and path is # in an unknown encoding anyway) - docpath = util.escapestr(path) - self.__doc__ = self.__doc__ % {'path': util.uirepr(docpath)} + docpath = stringutil.escapestr(path) + self.__doc__ %= {r'path': pycompat.sysstr(stringutil.uirepr(docpath))} self._cmdline = cmdline def __call__(self, ui, repo, *pats, **opts): opts = pycompat.byteskwargs(opts) - options = ' '.join(map(util.shellquote, opts['option'])) + options = ' '.join(map(procutil.shellquote, opts['option'])) if options: options = ' ' + options return dodiff(ui, repo, self._cmdline + options, pats, opts) @@ -382,11 +400,11 @@ if cmd.startswith('cmd.'): cmd = cmd[4:] if not path: - path = util.findexe(cmd) + path = procutil.findexe(cmd) if path is None: path = filemerge.findexternaltool(ui, cmd) or cmd diffopts = ui.config('extdiff', 'opts.' 
+ cmd) - cmdline = util.shellquote(path) + cmdline = procutil.shellquote(path) if diffopts: cmdline += ' ' + diffopts elif cmd.startswith('opts.'): @@ -398,10 +416,10 @@ diffopts = len(pycompat.shlexsplit(cmdline)) > 1 else: # case "cmd =" - path = util.findexe(cmd) + path = procutil.findexe(cmd) if path is None: path = filemerge.findexternaltool(ui, cmd) or cmd - cmdline = util.shellquote(path) + cmdline = procutil.shellquote(path) diffopts = False # look for diff arguments in [diff-tools] then [merge-tools] if not diffopts: diff -r fb92df8b634c -r ed5448edcbfa hgext/fetch.py --- a/hgext/fetch.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/fetch.py Wed Apr 18 15:32:08 2018 -0400 @@ -23,6 +23,7 @@ registrar, util, ) +from mercurial.utils import dateutil release = lock.release cmdtable = {} @@ -64,7 +65,7 @@ opts = pycompat.byteskwargs(opts) date = opts.get('date') if date: - opts['date'] = util.parsedate(date) + opts['date'] = dateutil.parsedate(date) parent, _p2 = repo.dirstate.parents() branch = repo.dirstate.branch() diff -r fb92df8b634c -r ed5448edcbfa hgext/fix.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/fix.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,553 @@ +# fix - rewrite file content in changesets and working copy +# +# Copyright 2018 Google LLC. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. +"""rewrite file content in changesets or working copy (EXPERIMENTAL) + +Provides a command that runs configured tools on the contents of modified files, +writing back any fixes to the working copy or replacing changesets. 
+ +Here is an example configuration that causes :hg:`fix` to apply automatic +formatting fixes to modified lines in C++ code:: + + [fix] + clang-format:command=clang-format --assume-filename={rootpath} + clang-format:linerange=--lines={first}:{last} + clang-format:fileset=set:**.cpp or **.hpp + +The :command suboption forms the first part of the shell command that will be +used to fix a file. The content of the file is passed on standard input, and the +fixed file content is expected on standard output. If there is any output on +standard error, the file will not be affected. Some values may be substituted +into the command:: + + {rootpath} The path of the file being fixed, relative to the repo root + {basename} The name of the file being fixed, without the directory path + +If the :linerange suboption is set, the tool will only be run if there are +changed lines in a file. The value of this suboption is appended to the shell +command once for every range of changed lines in the file. Some values may be +substituted into the command:: + + {first} The 1-based line number of the first line in the modified range + {last} The 1-based line number of the last line in the modified range + +The :fileset suboption determines which files will be passed through each +configured tool. See :hg:`help fileset` for possible values. If there are file +arguments to :hg:`fix`, the intersection of these filesets is used. 
+ +There is also a configurable limit for the maximum size of file that will be +processed by :hg:`fix`:: + + [fix] + maxfilesize=2MB + +""" + +from __future__ import absolute_import + +import collections +import itertools +import os +import re +import subprocess + +from mercurial.i18n import _ +from mercurial.node import nullrev +from mercurial.node import wdirrev + +from mercurial import ( + cmdutil, + context, + copies, + error, + mdiff, + merge, + obsolete, + pycompat, + registrar, + scmutil, + util, +) + +# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for +# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should +# be specifying the version(s) of Mercurial they are tested with, or +# leave the attribute unspecified. +testedwith = 'ships-with-hg-core' + +cmdtable = {} +command = registrar.command(cmdtable) + +configtable = {} +configitem = registrar.configitem(configtable) + +# Register the suboptions allowed for each configured fixer. +FIXER_ATTRS = ('command', 'linerange', 'fileset') + +for key in FIXER_ATTRS: + configitem('fix', '.*(:%s)?' % key, default=None, generic=True) + +# A good default size allows most source code files to be fixed, but avoids +# letting fixer tools choke on huge inputs, which could be surprising to the +# user. +configitem('fix', 'maxfilesize', default='2MB') + +@command('fix', + [('', 'all', False, _('fix all non-public non-obsolete revisions')), + ('', 'base', [], _('revisions to diff against (overrides automatic ' + 'selection, and applies to every revision being ' + 'fixed)'), _('REV')), + ('r', 'rev', [], _('revisions to fix'), _('REV')), + ('w', 'working-dir', False, _('fix the working directory')), + ('', 'whole', False, _('always fix every line of a file'))], + _('[OPTION]... [FILE]...')) +def fix(ui, repo, *pats, **opts): + """rewrite file content in changesets or working directory + + Runs any configured tools to fix the content of files. 
Only affects files + with changes, unless file arguments are provided. Only affects changed lines + of files, unless the --whole flag is used. Some tools may always affect the + whole file regardless of --whole. + + If revisions are specified with --rev, those revisions will be checked, and + they may be replaced with new revisions that have fixed file content. It is + desirable to specify all descendants of each specified revision, so that the + fixes propagate to the descendants. If all descendants are fixed at the same + time, no merging, rebasing, or evolution will be required. + + If --working-dir is used, files with uncommitted changes in the working copy + will be fixed. If the checked-out revision is also fixed, the working + directory will update to the replacement revision. + + When determining what lines of each file to fix at each revision, the whole + set of revisions being fixed is considered, so that fixes to earlier + revisions are not forgotten in later ones. The --base flag can be used to + override this default behavior, though it is not usually desirable to do so. + """ + opts = pycompat.byteskwargs(opts) + if opts['all']: + if opts['rev']: + raise error.Abort(_('cannot specify both "--rev" and "--all"')) + opts['rev'] = ['not public() and not obsolete()'] + opts['working_dir'] = True + with repo.wlock(), repo.lock(): + revstofix = getrevstofix(ui, repo, opts) + basectxs = getbasectxs(repo, opts, revstofix) + workqueue, numitems = getworkqueue(ui, repo, pats, opts, revstofix, + basectxs) + filedata = collections.defaultdict(dict) + replacements = {} + fixers = getfixers(ui) + # Some day this loop can become a worker pool, but for now it's easier + # to fix everything serially in topological order. 
+ for rev, path in sorted(workqueue): + ctx = repo[rev] + olddata = ctx[path].data() + newdata = fixfile(ui, opts, fixers, ctx, path, basectxs[rev]) + if newdata != olddata: + filedata[rev][path] = newdata + numitems[rev] -= 1 + if not numitems[rev]: + if rev == wdirrev: + writeworkingdir(repo, ctx, filedata[rev], replacements) + else: + replacerev(ui, repo, ctx, filedata[rev], replacements) + del filedata[rev] + + replacements = {prec: [succ] for prec, succ in replacements.iteritems()} + scmutil.cleanupnodes(repo, replacements, 'fix') + +def getworkqueue(ui, repo, pats, opts, revstofix, basectxs): + """"Constructs the list of files to be fixed at specific revisions + + It is up to the caller how to consume the work items, and the only + dependence between them is that replacement revisions must be committed in + topological order. Each work item represents a file in the working copy or + in some revision that should be fixed and written back to the working copy + or into a replacement revision. 
+ """ + workqueue = [] + numitems = collections.defaultdict(int) + maxfilesize = ui.configbytes('fix', 'maxfilesize') + for rev in revstofix: + fixctx = repo[rev] + match = scmutil.match(fixctx, pats, opts) + for path in pathstofix(ui, repo, pats, opts, match, basectxs[rev], + fixctx): + if path not in fixctx: + continue + fctx = fixctx[path] + if fctx.islink(): + continue + if fctx.size() > maxfilesize: + ui.warn(_('ignoring file larger than %s: %s\n') % + (util.bytecount(maxfilesize), path)) + continue + workqueue.append((rev, path)) + numitems[rev] += 1 + return workqueue, numitems + +def getrevstofix(ui, repo, opts): + """Returns the set of revision numbers that should be fixed""" + revs = set(scmutil.revrange(repo, opts['rev'])) + for rev in revs: + checkfixablectx(ui, repo, repo[rev]) + if revs: + cmdutil.checkunfinished(repo) + checknodescendants(repo, revs) + if opts.get('working_dir'): + revs.add(wdirrev) + if list(merge.mergestate.read(repo).unresolved()): + raise error.Abort('unresolved conflicts', hint="use 'hg resolve'") + if not revs: + raise error.Abort( + 'no changesets specified', hint='use --rev or --working-dir') + return revs + +def checknodescendants(repo, revs): + if (not obsolete.isenabled(repo, obsolete.allowunstableopt) and + repo.revs('(%ld::) - (%ld)', revs, revs)): + raise error.Abort(_('can only fix a changeset together ' + 'with all its descendants')) + +def checkfixablectx(ui, repo, ctx): + """Aborts if the revision shouldn't be replaced with a fixed one.""" + if not ctx.mutable(): + raise error.Abort('can\'t fix immutable changeset %s' % + (scmutil.formatchangeid(ctx),)) + if ctx.obsolete(): + # It would be better to actually check if the revision has a successor. 
+ allowdivergence = ui.configbool('experimental', + 'evolution.allowdivergence') + if not allowdivergence: + raise error.Abort('fixing obsolete revision could cause divergence') + +def pathstofix(ui, repo, pats, opts, match, basectxs, fixctx): + """Returns the set of files that should be fixed in a context + + The result depends on the base contexts; we include any file that has + changed relative to any of the base contexts. Base contexts should be + ancestors of the context being fixed. + """ + files = set() + for basectx in basectxs: + stat = repo.status( + basectx, fixctx, match=match, clean=bool(pats), unknown=bool(pats)) + files.update( + set(itertools.chain(stat.added, stat.modified, stat.clean, + stat.unknown))) + return files + +def lineranges(opts, path, basectxs, fixctx, content2): + """Returns the set of line ranges that should be fixed in a file + + Of the form [(10, 20), (30, 40)]. + + This depends on the given base contexts; we must consider lines that have + changed versus any of the base contexts, and whether the file has been + renamed versus any of them. + + Another way to understand this is that we exclude line ranges that are + common to the file in all base contexts. + """ + if opts.get('whole'): + # Return a range containing all lines. Rely on the diff implementation's + # idea of how many lines are in the file, instead of reimplementing it. 
+ return difflineranges('', content2) + + rangeslist = [] + for basectx in basectxs: + basepath = copies.pathcopies(basectx, fixctx).get(path, path) + if basepath in basectx: + content1 = basectx[basepath].data() + else: + content1 = '' + rangeslist.extend(difflineranges(content1, content2)) + return unionranges(rangeslist) + +def unionranges(rangeslist): + """Return the union of some closed intervals + + >>> unionranges([]) + [] + >>> unionranges([(1, 100)]) + [(1, 100)] + >>> unionranges([(1, 100), (1, 100)]) + [(1, 100)] + >>> unionranges([(1, 100), (2, 100)]) + [(1, 100)] + >>> unionranges([(1, 99), (1, 100)]) + [(1, 100)] + >>> unionranges([(1, 100), (40, 60)]) + [(1, 100)] + >>> unionranges([(1, 49), (50, 100)]) + [(1, 100)] + >>> unionranges([(1, 48), (50, 100)]) + [(1, 48), (50, 100)] + >>> unionranges([(1, 2), (3, 4), (5, 6)]) + [(1, 6)] + """ + rangeslist = sorted(set(rangeslist)) + unioned = [] + if rangeslist: + unioned, rangeslist = [rangeslist[0]], rangeslist[1:] + for a, b in rangeslist: + c, d = unioned[-1] + if a > d + 1: + unioned.append((a, b)) + else: + unioned[-1] = (c, max(b, d)) + return unioned + +def difflineranges(content1, content2): + """Return list of line number ranges in content2 that differ from content1. + + Line numbers are 1-based. The numbers are the first and last line contained + in the range. Single-line ranges have the same line number for the first and + last line. Excludes any empty ranges that result from lines that are only + present in content1. Relies on mdiff's idea of where the line endings are in + the string. 
+ + >>> from mercurial import pycompat + >>> lines = lambda s: b'\\n'.join([c for c in pycompat.iterbytestr(s)]) + >>> difflineranges2 = lambda a, b: difflineranges(lines(a), lines(b)) + >>> difflineranges2(b'', b'') + [] + >>> difflineranges2(b'a', b'') + [] + >>> difflineranges2(b'', b'A') + [(1, 1)] + >>> difflineranges2(b'a', b'a') + [] + >>> difflineranges2(b'a', b'A') + [(1, 1)] + >>> difflineranges2(b'ab', b'') + [] + >>> difflineranges2(b'', b'AB') + [(1, 2)] + >>> difflineranges2(b'abc', b'ac') + [] + >>> difflineranges2(b'ab', b'aCb') + [(2, 2)] + >>> difflineranges2(b'abc', b'aBc') + [(2, 2)] + >>> difflineranges2(b'ab', b'AB') + [(1, 2)] + >>> difflineranges2(b'abcde', b'aBcDe') + [(2, 2), (4, 4)] + >>> difflineranges2(b'abcde', b'aBCDe') + [(2, 4)] + """ + ranges = [] + for lines, kind in mdiff.allblocks(content1, content2): + firstline, lastline = lines[2:4] + if kind == '!' and firstline != lastline: + ranges.append((firstline + 1, lastline)) + return ranges + +def getbasectxs(repo, opts, revstofix): + """Returns a map of the base contexts for each revision + + The base contexts determine which lines are considered modified when we + attempt to fix just the modified lines in a file. + """ + # The --base flag overrides the usual logic, and we give every revision + # exactly the set of baserevs that the user specified. + if opts.get('base'): + baserevs = set(scmutil.revrange(repo, opts.get('base'))) + if not baserevs: + baserevs = {nullrev} + basectxs = {repo[rev] for rev in baserevs} + return {rev: basectxs for rev in revstofix} + + # Proceed in topological order so that we can easily determine each + # revision's baserevs by looking at its parents and their baserevs. 
+ basectxs = collections.defaultdict(set) + for rev in sorted(revstofix): + ctx = repo[rev] + for pctx in ctx.parents(): + if pctx.rev() in basectxs: + basectxs[rev].update(basectxs[pctx.rev()]) + else: + basectxs[rev].add(pctx) + return basectxs + +def fixfile(ui, opts, fixers, fixctx, path, basectxs): + """Run any configured fixers that should affect the file in this context + + Returns the file content that results from applying the fixers in some order + starting with the file's content in the fixctx. Fixers that support line + ranges will affect lines that have changed relative to any of the basectxs + (i.e. they will only avoid lines that are common to all basectxs). + """ + newdata = fixctx[path].data() + for fixername, fixer in fixers.iteritems(): + if fixer.affects(opts, fixctx, path): + ranges = lineranges(opts, path, basectxs, fixctx, newdata) + command = fixer.command(ui, path, ranges) + if command is None: + continue + ui.debug('subprocess: %s\n' % (command,)) + proc = subprocess.Popen( + command, + shell=True, + cwd='/', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + newerdata, stderr = proc.communicate(newdata) + if stderr: + showstderr(ui, fixctx.rev(), fixername, stderr) + else: + newdata = newerdata + return newdata + +def showstderr(ui, rev, fixername, stderr): + """Writes the lines of the stderr string as warnings on the ui + + Uses the revision number and fixername to give more context to each line of + the error message. Doesn't include file names, since those take up a lot of + space and would tend to be included in the error message if they were + relevant. 
+ """ + for line in re.split('[\r\n]+', stderr): + if line: + ui.warn(('[')) + if rev is None: + ui.warn(_('wdir'), label='evolve.rev') + else: + ui.warn((str(rev)), label='evolve.rev') + ui.warn(('] %s: %s\n') % (fixername, line)) + +def writeworkingdir(repo, ctx, filedata, replacements): + """Write new content to the working copy and check out the new p1 if any + + We check out a new revision if and only if we fixed something in both the + working directory and its parent revision. This avoids the need for a full + update/merge, and means that the working directory simply isn't affected + unless the --working-dir flag is given. + + Directly updates the dirstate for the affected files. + """ + for path, data in filedata.iteritems(): + fctx = ctx[path] + fctx.write(data, fctx.flags()) + if repo.dirstate[path] == 'n': + repo.dirstate.normallookup(path) + + oldparentnodes = repo.dirstate.parents() + newparentnodes = [replacements.get(n, n) for n in oldparentnodes] + if newparentnodes != oldparentnodes: + repo.setparents(*newparentnodes) + +def replacerev(ui, repo, ctx, filedata, replacements): + """Commit a new revision like the given one, but with file content changes + + "ctx" is the original revision to be replaced by a modified one. + + "filedata" is a dict that maps paths to their new file content. All other + paths will be recreated from the original revision without changes. + "filedata" may contain paths that didn't exist in the original revision; + they will be added. + + "replacements" is a dict that maps a single node to a single node, and it is + updated to indicate the original revision is replaced by the newly created + one. No entry is added if the replacement's node already exists. + + The new revision has the same parents as the old one, unless those parents + have already been replaced, in which case those replacements are the parents + of this new revision. 
Thus, if revisions are replaced in topological order, + there is no need to rebase them into the original topology later. + """ + + p1rev, p2rev = repo.changelog.parentrevs(ctx.rev()) + p1ctx, p2ctx = repo[p1rev], repo[p2rev] + newp1node = replacements.get(p1ctx.node(), p1ctx.node()) + newp2node = replacements.get(p2ctx.node(), p2ctx.node()) + + def filectxfn(repo, memctx, path): + if path not in ctx: + return None + fctx = ctx[path] + copied = fctx.renamed() + if copied: + copied = copied[0] + return context.memfilectx( + repo, + memctx, + path=fctx.path(), + data=filedata.get(path, fctx.data()), + islink=fctx.islink(), + isexec=fctx.isexec(), + copied=copied) + + overrides = {('phases', 'new-commit'): ctx.phase()} + with ui.configoverride(overrides, source='fix'): + memctx = context.memctx( + repo, + parents=(newp1node, newp2node), + text=ctx.description(), + files=set(ctx.files()) | set(filedata.keys()), + filectxfn=filectxfn, + user=ctx.user(), + date=ctx.date(), + extra=ctx.extra(), + branch=ctx.branch(), + editor=None) + sucnode = memctx.commit() + prenode = ctx.node() + if prenode == sucnode: + ui.debug('node %s already existed\n' % (ctx.hex())) + else: + replacements[ctx.node()] = sucnode + +def getfixers(ui): + """Returns a map of configured fixer tools indexed by their names + + Each value is a Fixer object with methods that implement the behavior of the + fixer's config suboptions. Does not validate the config values. 
+ """ + result = {} + for name in fixernames(ui): + result[name] = Fixer() + attrs = ui.configsuboptions('fix', name)[1] + for key in FIXER_ATTRS: + setattr(result[name], pycompat.sysstr('_' + key), + attrs.get(key, '')) + return result + +def fixernames(ui): + """Returns the names of [fix] config options that have suboptions""" + names = set() + for k, v in ui.configitems('fix'): + if ':' in k: + names.add(k.split(':', 1)[0]) + return names + +class Fixer(object): + """Wraps the raw config values for a fixer with methods""" + + def affects(self, opts, fixctx, path): + """Should this fixer run on the file at the given path and context?""" + return scmutil.match(fixctx, [self._fileset], opts)(path) + + def command(self, ui, path, ranges): + """A shell command to use to invoke this fixer on the given file/lines + + May return None if there is no appropriate command to run for the given + parameters. + """ + expand = cmdutil.rendercommandtemplate + parts = [expand(ui, self._command, + {'rootpath': path, 'basename': os.path.basename(path)})] + if self._linerange: + if not ranges: + # No line ranges to fix, so don't run the fixer. + return None + for first, last in ranges: + parts.append(expand(ui, self._linerange, + {'first': first, 'last': last})) + return ' '.join(parts) diff -r fb92df8b634c -r ed5448edcbfa hgext/fsmonitor/pywatchman/bser.c --- a/hgext/fsmonitor/pywatchman/bser.c Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/fsmonitor/pywatchman/bser.c Wed Apr 18 15:32:08 2018 -0400 @@ -128,27 +128,38 @@ Py_ssize_t i, n; PyObject* name_bytes = NULL; PyObject* ret = NULL; - const char* namestr; + const char* namestr = NULL; if (PyIndex_Check(name)) { i = PyNumber_AsSsize_t(name, PyExc_IndexError); if (i == -1 && PyErr_Occurred()) { goto bail; } - ret = PySequence_GetItem(obj->values, i); - goto bail; - } - // We can be passed in Unicode objects here -- we don't support anything other - // than UTF-8 for keys. 
- if (PyUnicode_Check(name)) { - name_bytes = PyUnicode_AsUTF8String(name); - if (name_bytes == NULL) { + if (i == 8 && PySequence_Size(obj->values) < 9) { + // Hack alert: Python 3 removed support for os.stat().st_mtime + // being an integer.Instead, if you need an integer, you have to + // use os.stat()[stat.ST_MTIME] instead. stat.ST_MTIME is 8, and + // our stat tuples are shorter than that, so we can detect + // requests for index 8 on tuples shorter than that and return + // st_mtime instead. + namestr = "st_mtime"; + } else { + ret = PySequence_GetItem(obj->values, i); goto bail; } - namestr = PyBytes_AsString(name_bytes); } else { - namestr = PyBytes_AsString(name); + // We can be passed in Unicode objects here -- we don't support anything other + // than UTF-8 for keys. + if (PyUnicode_Check(name)) { + name_bytes = PyUnicode_AsUTF8String(name); + if (name_bytes == NULL) { + goto bail; + } + namestr = PyBytes_AsString(name_bytes); + } else { + namestr = PyBytes_AsString(name); + } } if (namestr == NULL) { diff -r fb92df8b634c -r ed5448edcbfa hgext/githelp.py --- a/hgext/githelp.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/githelp.py Wed Apr 18 15:32:08 2018 -0400 @@ -22,10 +22,14 @@ from mercurial.i18n import _ from mercurial import ( + encoding, error, fancyopts, registrar, - util, + scmutil, +) +from mercurial.utils import ( + procutil, ) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for @@ -109,7 +113,7 @@ self.args = [] self.opts = {} - def __str__(self): + def __bytes__(self): cmd = "hg " + self.name if self.opts: for k, values in sorted(self.opts.iteritems()): @@ -123,6 +127,8 @@ cmd += " ".join(self.args) return cmd + __str__ = encoding.strmethod(__bytes__) + def append(self, value): self.args.append(value) @@ -167,14 +173,14 @@ ui.status(_("note: use hg addremove to remove files that have " "been deleted.\n\n")) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def am(ui, repo, *args, **kwargs): 
cmdoptions=[ ] args, opts = parseoptions(ui, cmdoptions, args) cmd = Command('import') - ui.status(str(cmd), "\n") + ui.status(bytes(cmd), "\n") def apply(ui, repo, *args, **kwargs): cmdoptions = [ @@ -187,7 +193,7 @@ cmd['-p'] = opts.get('p') cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def bisect(ui, repo, *args, **kwargs): ui.status(_("See 'hg help bisect' for how to use bisect.\n\n")) @@ -198,7 +204,7 @@ args, opts = parseoptions(ui, cmdoptions, args) cmd = Command('annotate -udl') cmd.extend([convert(v) for v in args]) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def branch(ui, repo, *args, **kwargs): cmdoptions = [ @@ -239,7 +245,7 @@ cmd.append(args[0]) elif len(args) == 1: cmd.append(args[0]) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def ispath(repo, string): """ @@ -248,7 +254,7 @@ too many ways to spell revisions in git for us to reasonably catch all of them, so let's be conservative. """ - if string in repo: + if scmutil.isrevsymbol(repo, string): # if it's definitely a revision let's not even check if a file of the # same name exists. 
return False @@ -330,7 +336,7 @@ else: raise error.Abort("a commit must be specified") - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def cherrypick(ui, repo, *args, **kwargs): cmdoptions = [ @@ -352,7 +358,7 @@ else: cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def clean(ui, repo, *args, **kwargs): cmdoptions = [ @@ -367,7 +373,7 @@ cmd['--all'] = None cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def clone(ui, repo, *args, **kwargs): cmdoptions = [ @@ -397,7 +403,7 @@ cocmd.append(opts.get('branch')) cmd = cmd & cocmd - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def commit(ui, repo, *args, **kwargs): cmdoptions = [ @@ -445,7 +451,7 @@ cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def deprecated(ui, repo, *args, **kwargs): ui.warn(_('This command has been deprecated in the git project, ' + @@ -476,7 +482,7 @@ except Exception: cmd.append(a) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def difftool(ui, repo, *args, **kwargs): ui.status(_('Mercurial does not enable external difftool by default. You ' @@ -509,7 +515,7 @@ else: cmd['-r'] = v - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def grep(ui, repo, *args, **kwargs): cmdoptions = [ @@ -522,7 +528,7 @@ # pattern first, followed by paths. 
cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def init(ui, repo, *args, **kwargs): cmdoptions = [ @@ -534,7 +540,7 @@ if len(args) > 0: cmd.append(args[0]) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def log(ui, repo, *args, **kwargs): cmdoptions = [ @@ -588,7 +594,7 @@ del args[0] cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def lsfiles(ui, repo, *args, **kwargs): cmdoptions = [ @@ -622,9 +628,9 @@ cmd['-0'] = None cmd.append('.') for include in args: - cmd['-I'] = util.shellquote(include) + cmd['-I'] = procutil.shellquote(include) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def merge(ui, repo, *args, **kwargs): cmdoptions = [ @@ -636,7 +642,7 @@ if len(args) > 0: cmd.append(args[len(args) - 1]) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def mergebase(ui, repo, *args, **kwargs): cmdoptions = [] @@ -650,7 +656,7 @@ ui.status(_('NOTE: ancestors() is part of the revset language.\n'), _("Learn more about revsets with 'hg help revsets'\n\n")) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def mergetool(ui, repo, *args, **kwargs): cmdoptions = [] @@ -661,7 +667,7 @@ if len(args) == 0: cmd['--all'] = None cmd.extend(args) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def mv(ui, repo, *args, **kwargs): cmdoptions = [ @@ -675,7 +681,7 @@ if opts.get('force'): cmd['-f'] = None - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def pull(ui, repo, *args, **kwargs): cmdoptions = [ @@ -701,7 +707,7 @@ else: cmd['-r'] = v - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def push(ui, repo, *args, **kwargs): cmdoptions = [ @@ -728,7 +734,7 @@ if opts.get('force'): cmd['-f'] = None - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def rebase(ui, repo, *args, **kwargs): cmdoptions = [ @@ -748,12 +754,12 @@ if len(args) > 0: ui.status(_("also note: 'hg histedit' will automatically detect" " your 
stack, so no second argument is necessary.\n\n")) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") return if opts.get('skip'): cmd = Command('revert --all -r .') - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") cmd = Command('rebase') @@ -777,7 +783,7 @@ cmd['-d'] = convert(args[0]) cmd['-b'] = convert(args[1]) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def reflog(ui, repo, *args, **kwargs): cmdoptions = [ @@ -791,7 +797,7 @@ if len(args) > 0: cmd.append(args[0]) - ui.status(str(cmd), "\n\n") + ui.status(bytes(cmd), "\n\n") ui.status(_("note: in hg commits can be deleted from repo but we always" " have backups.\n")) @@ -819,7 +825,7 @@ cmd.append(commit) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def revert(ui, repo, *args, **kwargs): cmdoptions = [ @@ -834,7 +840,7 @@ if args: cmd.append(args[0]) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def revparse(ui, repo, *args, **kwargs): cmdoptions = [ @@ -847,7 +853,7 @@ cmd = Command('root') if opts.get('show_cdup'): ui.status(_("note: hg root prints the root of the repository\n\n")) - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") else: ui.status(_("note: see hg help revset for how to refer to commits\n")) @@ -866,7 +872,7 @@ if opts.get('dry_run'): cmd['-n'] = None - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def show(ui, repo, *args, **kwargs): cmdoptions = [ @@ -898,7 +904,7 @@ else: cmd = Command('export') - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def stash(ui, repo, *args, **kwargs): cmdoptions = [ @@ -934,7 +940,7 @@ elif len(args) > 1: cmd['--name'] = args[1] - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def status(ui, repo, *args, **kwargs): cmdoptions = [ @@ -948,7 +954,7 @@ if opts.get('ignored'): cmd['-i'] = None - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def svn(ui, repo, *args, **kwargs): svncmd = args[0] @@ -965,7 +971,7 @@ cmd = 
Command('push') - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def svnfetch(ui, repo, *args, **kwargs): cmdoptions = [ @@ -975,7 +981,7 @@ cmd = Command('pull') cmd.append('default-push') - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def svnfindrev(ui, repo, *args, **kwargs): cmdoptions = [ @@ -985,7 +991,7 @@ cmd = Command('log') cmd['-r'] = args[0] - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def svnrebase(ui, repo, *args, **kwargs): cmdoptions = [ @@ -1000,7 +1006,7 @@ cmd = pullcmd & rebasecmd - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") def tag(ui, repo, *args, **kwargs): cmdoptions = [ @@ -1024,7 +1030,7 @@ if opts.get('force'): cmd['-f'] = None - ui.status((str(cmd)), "\n") + ui.status((bytes(cmd)), "\n") gitcommands = { 'add': add, diff -r fb92df8b634c -r ed5448edcbfa hgext/gpg.py --- a/hgext/gpg.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/gpg.py Wed Apr 18 15:32:08 2018 -0400 @@ -19,7 +19,10 @@ node as hgnode, pycompat, registrar, - util, +) +from mercurial.utils import ( + dateutil, + procutil, ) cmdtable = {} @@ -51,7 +54,7 @@ def sign(self, data): gpgcmd = "%s --sign --detach-sign%s" % (self.path, self.key) - return util.filter(data, gpgcmd) + return procutil.filter(data, gpgcmd) def verify(self, data, sig): """ returns of the good and bad signatures""" @@ -59,16 +62,16 @@ try: # create temporary files fd, sigfile = tempfile.mkstemp(prefix="hg-gpg-", suffix=".sig") - fp = os.fdopen(fd, pycompat.sysstr('wb')) + fp = os.fdopen(fd, r'wb') fp.write(sig) fp.close() fd, datafile = tempfile.mkstemp(prefix="hg-gpg-", suffix=".txt") - fp = os.fdopen(fd, pycompat.sysstr('wb')) + fp = os.fdopen(fd, r'wb') fp.write(data) fp.close() gpgcmd = ("%s --logger-fd 1 --status-fd 1 --verify " "\"%s\" \"%s\"" % (self.path, sigfile, datafile)) - ret = util.filter("", gpgcmd) + ret = procutil.filter("", gpgcmd) finally: for f in (sigfile, datafile): try: @@ -153,8 +156,7 @@ # warn for expired key and/or sigs 
for key in keys: if key[0] == "ERRSIG": - ui.write(_("%s Unknown key ID \"%s\"\n") - % (prefix, shortkey(ui, key[1][:15]))) + ui.write(_("%s Unknown key ID \"%s\"\n") % (prefix, key[1])) continue if key[0] == "BADSIG": ui.write(_("%s Bad signature from \"%s\"\n") % (prefix, key[2])) @@ -259,7 +261,7 @@ date = opts.get('date') if date: - opts['date'] = util.parsedate(date) + opts['date'] = dateutil.parsedate(date) if revs: nodes = [repo.lookup(n) for n in revs] @@ -318,14 +320,7 @@ repo.commit(message, opts['user'], opts['date'], match=msigs, editor=editor) except ValueError as inst: - raise error.Abort(str(inst)) - -def shortkey(ui, key): - if len(key) != 16: - ui.debug("key ID \"%s\" format error\n" % key) - return key - - return key[-8:] + raise error.Abort(pycompat.bytestr(inst)) def node2txt(repo, node, ver): """map a manifest into some text""" diff -r fb92df8b634c -r ed5448edcbfa hgext/hgk.py --- a/hgext/hgk.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/hgk.py Wed Apr 18 15:32:08 2018 -0400 @@ -51,7 +51,6 @@ pycompat, registrar, scmutil, - util, ) cmdtable = {} @@ -105,15 +104,15 @@ while True: if opts[r'stdin']: - try: - line = util.bytesinput(ui.fin, ui.fout).split(' ') - node1 = line[0] - if len(line) > 1: - node2 = line[1] - else: - node2 = None - except EOFError: + line = ui.fin.readline() + if not line: break + line = line.rstrip(pycompat.oslinesep).split(b' ') + node1 = line[0] + if len(line) > 1: + node2 = line[1] + else: + node2 = None node1 = repo.lookup(node1) if node2: node2 = repo.lookup(node2) @@ -146,7 +145,7 @@ date = ctx.date() description = ctx.description().replace("\0", "") - ui.write(("author %s %s %s\n" % (ctx.user(), int(date[0]), date[1]))) + ui.write(("author %s %d %d\n" % (ctx.user(), int(date[0]), date[1]))) if 'committer' in ctx.extra(): ui.write(("committer %s\n" % ctx.extra()['committer'])) @@ -186,12 +185,11 @@ # prefix = "" if opts[r'stdin']: - try: - (type, r) = util.bytesinput(ui.fin, ui.fout).split(' ') - prefix = " " - 
except EOFError: + line = ui.fin.readline() + if not line: return - + (type, r) = line.rstrip(pycompat.oslinesep).split(b' ') + prefix = " " else: if not type or not r: ui.warn(_("cat-file: type or revision not supplied\n")) @@ -204,10 +202,10 @@ n = repo.lookup(r) catcommit(ui, repo, n, prefix) if opts[r'stdin']: - try: - (type, r) = util.bytesinput(ui.fin, ui.fout).split(' ') - except EOFError: + line = ui.fin.readline() + if not line: break + (type, r) = line.rstrip(pycompat.oslinesep).split(b' ') else: break diff -r fb92df8b634c -r ed5448edcbfa hgext/highlight/__init__.py --- a/hgext/highlight/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/highlight/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -30,13 +30,11 @@ from . import highlight from mercurial.hgweb import ( - common, webcommands, webutil, ) from mercurial import ( - encoding, extensions, fileset, ) @@ -59,8 +57,8 @@ highlight.pygmentize(field, fctx, style, tmpl, guessfilenameonly=filenameonly) -def filerevision_highlight(orig, web, req, tmpl, fctx): - mt = ''.join(tmpl('mimetype', encoding=encoding.encoding)) +def filerevision_highlight(orig, web, fctx): + mt = web.res.headers['Content-Type'] # only pygmentize for mimetype containing 'html' so we both match # 'text/html' and possibly 'application/xhtml+xml' in the future # so that we don't have to touch the extension when the mimetype @@ -69,24 +67,27 @@ # can't clash with the file's content-type here in case we # pygmentize a html file if 'html' in mt: - pygmentize(web, 'fileline', fctx, tmpl) + pygmentize(web, 'fileline', fctx, web.tmpl) - return orig(web, req, tmpl, fctx) + return orig(web, fctx) -def annotate_highlight(orig, web, req, tmpl): - mt = ''.join(tmpl('mimetype', encoding=encoding.encoding)) +def annotate_highlight(orig, web): + mt = web.res.headers['Content-Type'] if 'html' in mt: - fctx = webutil.filectx(web.repo, req) - pygmentize(web, 'annotateline', fctx, tmpl) + fctx = webutil.filectx(web.repo, web.req) + pygmentize(web, 
'annotateline', fctx, web.tmpl) - return orig(web, req, tmpl) + return orig(web) -def generate_css(web, req, tmpl): +def generate_css(web): pg_style = web.config('web', 'pygments_style', 'colorful') fmter = highlight.HtmlFormatter(style=pg_style) - req.respond(common.HTTP_OK, 'text/css') - return ['/* pygments_style = %s */\n\n' % pg_style, - fmter.get_style_defs('')] + web.res.headers['Content-Type'] = 'text/css' + web.res.setbodybytes(''.join([ + '/* pygments_style = %s */\n\n' % pg_style, + fmter.get_style_defs(''), + ])) + return web.res.sendresponse() def extsetup(): # monkeypatch in the new version diff -r fb92df8b634c -r ed5448edcbfa hgext/highlight/highlight.py --- a/hgext/highlight/highlight.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/highlight/highlight.py Wed Apr 18 15:32:08 2018 -0400 @@ -15,7 +15,10 @@ from mercurial import ( encoding, - util, +) + +from mercurial.utils import ( + stringutil, ) with demandimport.deactivated(): @@ -47,7 +50,7 @@ tmpl.cache['header'] = new_header text = fctx.data() - if util.binary(text): + if stringutil.binary(text): return # str.splitlines() != unicode.splitlines() because "reasons" diff -r fb92df8b634c -r ed5448edcbfa hgext/histedit.py --- a/hgext/histedit.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/histedit.py Wed Apr 18 15:32:08 2018 -0400 @@ -209,6 +209,9 @@ scmutil, util, ) +from mercurial.utils import ( + stringutil, +) pickle = util.pickle release = lock.release @@ -221,7 +224,7 @@ default=False, ) configitem('histedit', 'defaultrev', - default=configitem.dynamicdefault, + default=None, ) configitem('histedit', 'dropmissing', default=False, @@ -344,7 +347,7 @@ fp.write('v1\n') fp.write('%s\n' % node.hex(self.parentctxnode)) fp.write('%s\n' % node.hex(self.topmost)) - fp.write('%s\n' % self.keep) + fp.write('%s\n' % ('True' if self.keep else 'False')) fp.write('%d\n' % len(self.actions)) for action in self.actions: fp.write('%s\n' % action.tostate()) @@ -422,24 +425,28 @@ def fromrule(cls, state, rule): 
"""Parses the given rule, returning an instance of the histeditaction. """ - rulehash = rule.strip().split(' ', 1)[0] + ruleid = rule.strip().split(' ', 1)[0] + # ruleid can be anything from rev numbers, hashes, "bookmarks" etc + # Check for validation of rule ids and get the rulehash try: - rev = node.bin(rulehash) + rev = node.bin(ruleid) except TypeError: - raise error.ParseError("invalid changeset %s" % rulehash) + try: + _ctx = scmutil.revsingle(state.repo, ruleid) + rulehash = _ctx.hex() + rev = node.bin(rulehash) + except error.RepoLookupError: + raise error.ParseError(_("invalid changeset %s") % ruleid) return cls(state, rev) def verify(self, prev, expected, seen): """ Verifies semantic correctness of the rule""" repo = self.repo ha = node.hex(self.node) - try: - self.node = repo[ha].node() - except error.RepoError: - raise error.ParseError(_('unknown changeset %s listed') - % ha[:12]) - if self.node is not None: - self._verifynodeconstraints(prev, expected, seen) + self.node = scmutil.resolvehexnodeidprefix(repo, ha) + if self.node is None: + raise error.ParseError(_('unknown changeset %s listed') % ha[:12]) + self._verifynodeconstraints(prev, expected, seen) def _verifynodeconstraints(self, prev, expected, seen): # by default command need a node in the edited list @@ -465,7 +472,7 @@ # (the 5 more are left for verb) maxlen = self.repo.ui.configint('histedit', 'linelen') maxlen = max(maxlen, 22) # avoid truncating hash - return util.ellipsis(line, maxlen) + return stringutil.ellipsis(line, maxlen) def tostate(self): """Print an action in format used by histedit state files @@ -489,9 +496,9 @@ hg.update(repo, self.state.parentctxnode, quietempty=True) stats = applychanges(repo.ui, repo, rulectx, {}) repo.dirstate.setbranch(rulectx.branch()) - if stats and stats[3] > 0: + if stats.unresolvedcount: buf = repo.ui.popbuffer() - repo.ui.write(*buf) + repo.ui.write(buf) raise error.InterventionRequired( _('Fix up the change (%s %s)') % (self.verb, 
node.short(self.node)), @@ -556,7 +563,7 @@ # edits are "in place" we do not need to make any merge, # just applies changes on parent for editing cmdutil.revert(ui, repo, ctx, (wcpar, node.nullid), all=True) - stats = None + stats = mergemod.updateresult(0, 0, 0, 0) else: try: # ui.forcemerge is an internal variable, do not document @@ -567,7 +574,7 @@ repo.ui.setconfig('ui', 'forcemerge', '', 'histedit') return stats -def collapse(repo, first, last, commitopts, skipprompt=False): +def collapse(repo, firstctx, lastctx, commitopts, skipprompt=False): """collapse the set of revisions from first to last as new one. Expected commit options are: @@ -577,14 +584,14 @@ Commit message is edited in all cases. This function works in memory.""" - ctxs = list(repo.set('%d::%d', first, last)) + ctxs = list(repo.set('%d::%d', firstctx.rev(), lastctx.rev())) if not ctxs: return None for c in ctxs: if not c.mutable(): raise error.ParseError( _("cannot fold into public change %s") % node.short(c.node())) - base = first.parents()[0] + base = firstctx.parents()[0] # commit a new version of the old changeset, including the update # collect all files which might be affected @@ -593,15 +600,15 @@ files.update(ctx.files()) # Recompute copies (avoid recording a -> b -> a) - copied = copies.pathcopies(base, last) + copied = copies.pathcopies(base, lastctx) # prune files which were reverted by the updates - files = [f for f in files if not cmdutil.samefile(f, last, base)] + files = [f for f in files if not cmdutil.samefile(f, lastctx, base)] # commit version of these files as defined by head - headmf = last.manifest() + headmf = lastctx.manifest() def filectxfn(repo, ctx, path): if path in headmf: - fctx = last[path] + fctx = lastctx[path] flags = fctx.flags() mctx = context.memfilectx(repo, ctx, fctx.path(), fctx.data(), @@ -614,12 +621,12 @@ if commitopts.get('message'): message = commitopts['message'] else: - message = first.description() + message = firstctx.description() user = 
commitopts.get('user') date = commitopts.get('date') extra = commitopts.get('extra') - parents = (first.p1().node(), first.p2().node()) + parents = (firstctx.p1().node(), firstctx.p2().node()) editor = None if not skipprompt: editor = cmdutil.getcommiteditor(edit=True, editform='histedit.fold') @@ -730,8 +737,9 @@ return ctx, [(self.node, (parentctxnode,))] parentctx = repo[parentctxnode] - newcommits = set(c.node() for c in repo.set('(%d::. - %d)', parentctx, - parentctx)) + newcommits = set(c.node() for c in repo.set('(%d::. - %d)', + parentctx.rev(), + parentctx.rev())) if not newcommits: repo.ui.warn(_('%s: cannot fold - working copy is not a ' 'descendant of previous commit %s\n') % @@ -888,10 +896,10 @@ if opts is None: opts = {} dest = ui.expandpath(remote or 'default-push', remote or 'default') - dest, revs = hg.parseurl(dest, None)[:2] + dest, branches = hg.parseurl(dest, None)[:2] ui.status(_('comparing with %s\n') % util.hidepassword(dest)) - revs, checkout = hg.addbranchrevs(repo, repo, revs, None) + revs, checkout = hg.addbranchrevs(repo, repo, branches, None) other = hg.peer(repo, opts, dest) if revs: @@ -905,7 +913,7 @@ msg = _('there are ambiguous outgoing revisions') hint = _("see 'hg help histedit' for more detail") raise error.Abort(msg, hint=hint) - return repo.lookup(roots[0]) + return repo[roots[0]].node() @command('histedit', [('', 'commands', '', @@ -1316,8 +1324,8 @@ # Create a backup so we can always abort completely. 
backupfile = None if not obsolete.isenabled(repo, obsolete.createmarkersopt): - backupfile = repair._bundle(repo, [parentctxnode], [topmost], root, - 'histedit') + backupfile = repair.backupbundle(repo, [parentctxnode], + [topmost], root, 'histedit') state.backupfile = backupfile def _getsummary(ctx): @@ -1353,19 +1361,19 @@ """select and validate the set of revision to edit When keep is false, the specified set can't have children.""" - ctxs = list(repo.set('%n::%n', old, new)) - if ctxs and not keep: + revs = repo.revs('%n::%n', old, new) + if revs and not keep: if (not obsolete.isenabled(repo, obsolete.allowunstableopt) and - repo.revs('(%ld::) - (%ld)', ctxs, ctxs)): + repo.revs('(%ld::) - (%ld)', revs, revs)): raise error.Abort(_('can only histedit a changeset together ' 'with all its descendants')) - if repo.revs('(%ld) and merge()', ctxs): + if repo.revs('(%ld) and merge()', revs): raise error.Abort(_('cannot edit history that contains merges')) - root = ctxs[0] # list is already sorted by repo.set + root = repo[revs.first()] # list is already sorted by repo.revs() if not root.mutable(): raise error.Abort(_('cannot edit public changeset: %s') % root, hint=_("see 'hg help phases' for details")) - return [c.node() for c in ctxs] + return pycompat.maplist(repo.changelog.node, revs) def ruleeditor(repo, ui, actions, editcomment=""): """open an editor to edit rules @@ -1415,9 +1423,8 @@ # Save edit rules in .hg/histedit-last-edit.txt in case # the user needs to ask for help after something # surprising happens. - f = open(repo.vfs.join('histedit-last-edit.txt'), 'w') - f.write(rules) - f.close() + with repo.vfs('histedit-last-edit.txt', 'wb') as f: + f.write(rules) return rules diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/README Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,23 @@ +## What is it? 
+ +This extension adds the ability to save certain pushes to a remote blob store +as bundles and to serve commits from the remote blob store. +The revisions are stored on disk or in everstore. +The metadata are stored in sql or on disk. + +## Config options + +infinitepush.branchpattern: pattern to detect a scratchbranch, example + 're:scratch/.+' + +infinitepush.indextype: disk or sql for the metadata +infinitepush.reponame: only relevant for sql metadata backend, reponame to put in + sql + +infinitepush.indexpath: only relevant for ondisk metadata backend, the path to + store the index on disk. If not set, it will be under .hg + in a folder named filebundlestore + +infinitepush.storepath: only relevant for ondisk metadata backend, the path to + store the bundles. If not set, it will be + .hg/filebundlestore diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,1187 @@ +# Infinite push +# +# Copyright 2016 Facebook, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. +""" store some pushes in a remote blob store on the server (EXPERIMENTAL) + + [infinitepush] + # Server-side and client-side option. Pattern of the infinitepush bookmark + branchpattern = PATTERN + + # Server or client + server = False + + # Server-side option. Possible values: 'disk' or 'sql'. Fails if not set + indextype = disk + + # Server-side option. Used only if indextype=sql. + # Format: 'IP:PORT:DB_NAME:USER:PASSWORD' + sqlhost = IP:PORT:DB_NAME:USER:PASSWORD + + # Server-side option. Used only if indextype=disk. + # Filesystem path to the index store + indexpath = PATH + + # Server-side option. Possible values: 'disk' or 'external' + # Fails if not set + storetype = disk + + # Server-side option.
+ # Path to the binary that will save bundle to the bundlestore + # Formatted cmd line will be passed to it (see `put_args`) + put_binary = put + + # Server-side option. Used only if storetype=external. + # Format cmd-line string for put binary. Placeholder: {filename} + put_args = {filename} + + # Server-side option. + # Path to the binary that gets bundle from the bundlestore. + # Formatted cmd line will be passed to it (see `get_args`) + get_binary = get + + # Server-side option. Used only if storetype=external. + # Format cmd-line string for get binary. Placeholders: {filename} {handle} + get_args = {filename} {handle} + + # Server-side option + logfile = FILE + + # Server-side option + loglevel = DEBUG + + # Server-side option. Used only if indextype=sql. + # Sets mysql wait_timeout option. + waittimeout = 300 + + # Server-side option. Used only if indextype=sql. + # Sets mysql innodb_lock_wait_timeout option. + locktimeout = 120 + + # Server-side option. Used only if indextype=sql. + # Name of the repository + reponame = '' + + # Client-side option. Used by --list-remote option. List of remote scratch + # patterns to list if no patterns are specified. + defaultremotepatterns = ['*'] + + # Instructs infinitepush to forward all received bundle2 parts to the + # bundle for storage. Defaults to False. + storeallparts = True + + # routes each incoming push to the bundlestore. defaults to False + pushtobundlestore = True + + [remotenames] + # Client-side option + # This option should be set only if remotenames extension is enabled. + # Whether remote bookmarks are tracked by remotenames extension.
+ bookmarks = True +""" + +from __future__ import absolute_import + +import collections +import contextlib +import errno +import functools +import logging +import os +import random +import re +import socket +import subprocess +import tempfile +import time + +from mercurial.node import ( + bin, + hex, +) + +from mercurial.i18n import _ + +from mercurial.utils import ( + procutil, + stringutil, +) + +from mercurial import ( + bundle2, + changegroup, + commands, + discovery, + encoding, + error, + exchange, + extensions, + hg, + localrepo, + phases, + pushkey, + pycompat, + registrar, + util, + wireprototypes, + wireprotov1peer, + wireprotov1server, +) + +from . import ( + bundleparts, + common, +) + +# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for +# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should +# be specifying the version(s) of Mercurial they are tested with, or +# leave the attribute unspecified. +testedwith = 'ships-with-hg-core' + +configtable = {} +configitem = registrar.configitem(configtable) + +configitem('infinitepush', 'server', + default=False, +) +configitem('infinitepush', 'storetype', + default='', +) +configitem('infinitepush', 'indextype', + default='', +) +configitem('infinitepush', 'indexpath', + default='', +) +configitem('infinitepush', 'storeallparts', + default=False, +) +configitem('infinitepush', 'reponame', + default='', +) +configitem('scratchbranch', 'storepath', + default='', +) +configitem('infinitepush', 'branchpattern', + default='', +) +configitem('infinitepush', 'pushtobundlestore', + default=False, +) +configitem('experimental', 'server-bundlestore-bookmark', + default='', +) +configitem('experimental', 'infinitepush-scratchpush', + default=False, +) + +experimental = 'experimental' +configbookmark = 'server-bundlestore-bookmark' +configscratchpush = 'infinitepush-scratchpush' + +scratchbranchparttype = bundleparts.scratchbranchparttype +revsetpredicate = 
registrar.revsetpredicate() +templatekeyword = registrar.templatekeyword() +_scratchbranchmatcher = lambda x: False +_maybehash = re.compile(r'^[a-f0-9]+$').search + +def _buildexternalbundlestore(ui): + put_args = ui.configlist('infinitepush', 'put_args', []) + put_binary = ui.config('infinitepush', 'put_binary') + if not put_binary: + raise error.Abort('put binary is not specified') + get_args = ui.configlist('infinitepush', 'get_args', []) + get_binary = ui.config('infinitepush', 'get_binary') + if not get_binary: + raise error.Abort('get binary is not specified') + from . import store + return store.externalbundlestore(put_binary, put_args, get_binary, get_args) + +def _buildsqlindex(ui): + sqlhost = ui.config('infinitepush', 'sqlhost') + if not sqlhost: + raise error.Abort(_('please set infinitepush.sqlhost')) + host, port, db, user, password = sqlhost.split(':') + reponame = ui.config('infinitepush', 'reponame') + if not reponame: + raise error.Abort(_('please set infinitepush.reponame')) + + logfile = ui.config('infinitepush', 'logfile', '') + waittimeout = ui.configint('infinitepush', 'waittimeout', 300) + locktimeout = ui.configint('infinitepush', 'locktimeout', 120) + from . import sqlindexapi + return sqlindexapi.sqlindexapi( + reponame, host, port, db, user, password, + logfile, _getloglevel(ui), waittimeout=waittimeout, + locktimeout=locktimeout) + +def _getloglevel(ui): + loglevel = ui.config('infinitepush', 'loglevel', 'DEBUG') + numeric_loglevel = getattr(logging, loglevel.upper(), None) + if not isinstance(numeric_loglevel, int): + raise error.Abort(_('invalid log level %s') % loglevel) + return numeric_loglevel + +def _tryhoist(ui, remotebookmark): + '''returns a bookmarks with hoisted part removed + + Remotenames extension has a 'hoist' config that allows to use remote + bookmarks without specifying remote path. For example, 'hg update master' + works as well as 'hg update remote/master'. We want to allow the same in + infinitepush. 
+ ''' + + if common.isremotebooksenabled(ui): + hoist = ui.config('remotenames', 'hoistedpeer') + '/' + if remotebookmark.startswith(hoist): + return remotebookmark[len(hoist):] + return remotebookmark + +class bundlestore(object): + def __init__(self, repo): + self._repo = repo + storetype = self._repo.ui.config('infinitepush', 'storetype') + if storetype == 'disk': + from . import store + self.store = store.filebundlestore(self._repo.ui, self._repo) + elif storetype == 'external': + self.store = _buildexternalbundlestore(self._repo.ui) + else: + raise error.Abort( + _('unknown infinitepush store type specified %s') % storetype) + + indextype = self._repo.ui.config('infinitepush', 'indextype') + if indextype == 'disk': + from . import fileindexapi + self.index = fileindexapi.fileindexapi(self._repo) + elif indextype == 'sql': + self.index = _buildsqlindex(self._repo.ui) + else: + raise error.Abort( + _('unknown infinitepush index type specified %s') % indextype) + +def _isserver(ui): + return ui.configbool('infinitepush', 'server') + +def reposetup(ui, repo): + if _isserver(ui) and repo.local(): + repo.bundlestore = bundlestore(repo) + +def extsetup(ui): + commonsetup(ui) + if _isserver(ui): + serverextsetup(ui) + else: + clientextsetup(ui) + +def commonsetup(ui): + wireprotov1server.commands['listkeyspatterns'] = ( + wireprotolistkeyspatterns, 'namespace patterns') + scratchbranchpat = ui.config('infinitepush', 'branchpattern') + if scratchbranchpat: + global _scratchbranchmatcher + kind, pat, _scratchbranchmatcher = \ + stringutil.stringmatcher(scratchbranchpat) + +def serverextsetup(ui): + origpushkeyhandler = bundle2.parthandlermapping['pushkey'] + + def newpushkeyhandler(*args, **kwargs): + bundle2pushkey(origpushkeyhandler, *args, **kwargs) + newpushkeyhandler.params = origpushkeyhandler.params + bundle2.parthandlermapping['pushkey'] = newpushkeyhandler + + orighandlephasehandler = bundle2.parthandlermapping['phase-heads'] + newphaseheadshandler = lambda 
*args, **kwargs: \ + bundle2handlephases(orighandlephasehandler, *args, **kwargs) + newphaseheadshandler.params = orighandlephasehandler.params + bundle2.parthandlermapping['phase-heads'] = newphaseheadshandler + + extensions.wrapfunction(localrepo.localrepository, 'listkeys', + localrepolistkeys) + wireprotov1server.commands['lookup'] = ( + _lookupwrap(wireprotov1server.commands['lookup'][0]), 'key') + extensions.wrapfunction(exchange, 'getbundlechunks', getbundlechunks) + + extensions.wrapfunction(bundle2, 'processparts', processparts) + +def clientextsetup(ui): + entry = extensions.wrapcommand(commands.table, 'push', _push) + + entry[1].append( + ('', 'bundle-store', None, + _('force push to go to bundle store (EXPERIMENTAL)'))) + + extensions.wrapcommand(commands.table, 'pull', _pull) + + extensions.wrapfunction(discovery, 'checkheads', _checkheads) + + wireprotov1peer.wirepeer.listkeyspatterns = listkeyspatterns + + partorder = exchange.b2partsgenorder + index = partorder.index('changeset') + partorder.insert( + index, partorder.pop(partorder.index(scratchbranchparttype))) + +def _checkheads(orig, pushop): + if pushop.ui.configbool(experimental, configscratchpush, False): + return + return orig(pushop) + +def wireprotolistkeyspatterns(repo, proto, namespace, patterns): + patterns = wireprototypes.decodelist(patterns) + d = repo.listkeys(encoding.tolocal(namespace), patterns).iteritems() + return pushkey.encodekeys(d) + +def localrepolistkeys(orig, self, namespace, patterns=None): + if namespace == 'bookmarks' and patterns: + index = self.bundlestore.index + results = {} + bookmarks = orig(self, namespace) + for pattern in patterns: + results.update(index.getbookmarks(pattern)) + if pattern.endswith('*'): + pattern = 're:^' + pattern[:-1] + '.*' + kind, pat, matcher = stringutil.stringmatcher(pattern) + for bookmark, node in bookmarks.iteritems(): + if matcher(bookmark): + results[bookmark] = node + return results + else: + return orig(self, namespace) + 
+@wireprotov1peer.batchable +def listkeyspatterns(self, namespace, patterns): + if not self.capable('pushkey'): + yield {}, None + f = wireprotov1peer.future() + self.ui.debug('preparing listkeys for "%s" with pattern "%s"\n' % + (namespace, patterns)) + yield { + 'namespace': encoding.fromlocal(namespace), + 'patterns': wireprototypes.encodelist(patterns) + }, f + d = f.value + self.ui.debug('received listkey for "%s": %i bytes\n' + % (namespace, len(d))) + yield pushkey.decodekeys(d) + +def _readbundlerevs(bundlerepo): + return list(bundlerepo.revs('bundle()')) + +def _includefilelogstobundle(bundlecaps, bundlerepo, bundlerevs, ui): + '''Tells remotefilelog to include all changed files to the changegroup + + By default remotefilelog doesn't include file content to the changegroup. + But we need to include it if we are fetching from bundlestore. + ''' + changedfiles = set() + cl = bundlerepo.changelog + for r in bundlerevs: + # [3] means changed files + changedfiles.update(cl.read(r)[3]) + if not changedfiles: + return bundlecaps + + changedfiles = '\0'.join(changedfiles) + newcaps = [] + appended = False + for cap in (bundlecaps or []): + if cap.startswith('excludepattern='): + newcaps.append('\0'.join((cap, changedfiles))) + appended = True + else: + newcaps.append(cap) + if not appended: + # Not found excludepattern cap. Just append it + newcaps.append('excludepattern=' + changedfiles) + + return newcaps + +def _rebundle(bundlerepo, bundleroots, unknownhead): + ''' + Bundle may include more revision then user requested. For example, + if user asks for revision but bundle also consists its descendants. + This function will filter out all revision that user is not requested. 
+ ''' + parts = [] + + version = '02' + outgoing = discovery.outgoing(bundlerepo, commonheads=bundleroots, + missingheads=[unknownhead]) + cgstream = changegroup.makestream(bundlerepo, outgoing, version, 'pull') + cgstream = util.chunkbuffer(cgstream).read() + cgpart = bundle2.bundlepart('changegroup', data=cgstream) + cgpart.addparam('version', version) + parts.append(cgpart) + + return parts + +def _getbundleroots(oldrepo, bundlerepo, bundlerevs): + cl = bundlerepo.changelog + bundleroots = [] + for rev in bundlerevs: + node = cl.node(rev) + parents = cl.parents(node) + for parent in parents: + # include all revs that exist in the main repo + # to make sure that bundle may apply client-side + if parent in oldrepo: + bundleroots.append(parent) + return bundleroots + +def _needsrebundling(head, bundlerepo): + bundleheads = list(bundlerepo.revs('heads(bundle())')) + return not (len(bundleheads) == 1 and + bundlerepo[bundleheads[0]].node() == head) + +def _generateoutputparts(head, bundlerepo, bundleroots, bundlefile): + '''generates bundle that will be send to the user + + returns tuple with raw bundle string and bundle type + ''' + parts = [] + if not _needsrebundling(head, bundlerepo): + with util.posixfile(bundlefile, "rb") as f: + unbundler = exchange.readbundle(bundlerepo.ui, f, bundlefile) + if isinstance(unbundler, changegroup.cg1unpacker): + part = bundle2.bundlepart('changegroup', + data=unbundler._stream.read()) + part.addparam('version', '01') + parts.append(part) + elif isinstance(unbundler, bundle2.unbundle20): + haschangegroup = False + for part in unbundler.iterparts(): + if part.type == 'changegroup': + haschangegroup = True + newpart = bundle2.bundlepart(part.type, data=part.read()) + for key, value in part.params.iteritems(): + newpart.addparam(key, value) + parts.append(newpart) + + if not haschangegroup: + raise error.Abort( + 'unexpected bundle without changegroup part, ' + + 'head: %s' % hex(head), + hint='report to administrator') + else: + 
raise error.Abort('unknown bundle type') + else: + parts = _rebundle(bundlerepo, bundleroots, head) + + return parts + +def getbundlechunks(orig, repo, source, heads=None, bundlecaps=None, **kwargs): + heads = heads or [] + # newheads are parents of roots of scratch bundles that were requested + newphases = {} + scratchbundles = [] + newheads = [] + scratchheads = [] + nodestobundle = {} + allbundlestocleanup = [] + try: + for head in heads: + if head not in repo.changelog.nodemap: + if head not in nodestobundle: + newbundlefile = common.downloadbundle(repo, head) + bundlepath = "bundle:%s+%s" % (repo.root, newbundlefile) + bundlerepo = hg.repository(repo.ui, bundlepath) + + allbundlestocleanup.append((bundlerepo, newbundlefile)) + bundlerevs = set(_readbundlerevs(bundlerepo)) + bundlecaps = _includefilelogstobundle( + bundlecaps, bundlerepo, bundlerevs, repo.ui) + cl = bundlerepo.changelog + bundleroots = _getbundleroots(repo, bundlerepo, bundlerevs) + for rev in bundlerevs: + node = cl.node(rev) + newphases[hex(node)] = str(phases.draft) + nodestobundle[node] = (bundlerepo, bundleroots, + newbundlefile) + + scratchbundles.append( + _generateoutputparts(head, *nodestobundle[head])) + newheads.extend(bundleroots) + scratchheads.append(head) + finally: + for bundlerepo, bundlefile in allbundlestocleanup: + bundlerepo.close() + try: + os.unlink(bundlefile) + except (IOError, OSError): + # if we can't cleanup the file then just ignore the error, + # no need to fail + pass + + pullfrombundlestore = bool(scratchbundles) + wrappedchangegrouppart = False + wrappedlistkeys = False + oldchangegrouppart = exchange.getbundle2partsmapping['changegroup'] + try: + def _changegrouppart(bundler, *args, **kwargs): + # Order is important here. First add non-scratch part + # and only then add parts with scratch bundles because + # non-scratch part contains parents of roots of scratch bundles. 
+ result = oldchangegrouppart(bundler, *args, **kwargs) + for bundle in scratchbundles: + for part in bundle: + bundler.addpart(part) + return result + + exchange.getbundle2partsmapping['changegroup'] = _changegrouppart + wrappedchangegrouppart = True + + def _listkeys(orig, self, namespace): + origvalues = orig(self, namespace) + if namespace == 'phases' and pullfrombundlestore: + if origvalues.get('publishing') == 'True': + # Make repo non-publishing to preserve draft phase + del origvalues['publishing'] + origvalues.update(newphases) + return origvalues + + extensions.wrapfunction(localrepo.localrepository, 'listkeys', + _listkeys) + wrappedlistkeys = True + heads = list((set(newheads) | set(heads)) - set(scratchheads)) + result = orig(repo, source, heads=heads, + bundlecaps=bundlecaps, **kwargs) + finally: + if wrappedchangegrouppart: + exchange.getbundle2partsmapping['changegroup'] = oldchangegrouppart + if wrappedlistkeys: + extensions.unwrapfunction(localrepo.localrepository, 'listkeys', + _listkeys) + return result + +def _lookupwrap(orig): + def _lookup(repo, proto, key): + localkey = encoding.tolocal(key) + + if isinstance(localkey, str) and _scratchbranchmatcher(localkey): + scratchnode = repo.bundlestore.index.getnode(localkey) + if scratchnode: + return "%s %s\n" % (1, scratchnode) + else: + return "%s %s\n" % (0, 'scratch branch %s not found' % localkey) + else: + try: + r = hex(repo.lookup(localkey)) + return "%s %s\n" % (1, r) + except Exception as inst: + if repo.bundlestore.index.getbundle(localkey): + return "%s %s\n" % (1, localkey) + else: + r = str(inst) + return "%s %s\n" % (0, r) + return _lookup + +def _pull(orig, ui, repo, source="default", **opts): + opts = pycompat.byteskwargs(opts) + # Copy paste from `pull` command + source, branches = hg.parseurl(ui.expandpath(source), opts.get('branch')) + + scratchbookmarks = {} + unfi = repo.unfiltered() + unknownnodes = [] + for rev in opts.get('rev', []): + if rev not in unfi: + 
unknownnodes.append(rev) + if opts.get('bookmark'): + bookmarks = [] + revs = opts.get('rev') or [] + for bookmark in opts.get('bookmark'): + if _scratchbranchmatcher(bookmark): + # rev is not known yet + # it will be fetched with listkeyspatterns next + scratchbookmarks[bookmark] = 'REVTOFETCH' + else: + bookmarks.append(bookmark) + + if scratchbookmarks: + other = hg.peer(repo, opts, source) + fetchedbookmarks = other.listkeyspatterns( + 'bookmarks', patterns=scratchbookmarks) + for bookmark in scratchbookmarks: + if bookmark not in fetchedbookmarks: + raise error.Abort('remote bookmark %s not found!' % + bookmark) + scratchbookmarks[bookmark] = fetchedbookmarks[bookmark] + revs.append(fetchedbookmarks[bookmark]) + opts['bookmark'] = bookmarks + opts['rev'] = revs + + if scratchbookmarks or unknownnodes: + # Set anyincoming to True + extensions.wrapfunction(discovery, 'findcommonincoming', + _findcommonincoming) + try: + # Remote scratch bookmarks will be deleted because remotenames doesn't + # know about them. Let's save it before pull and restore after + remotescratchbookmarks = _readscratchremotebookmarks(ui, repo, source) + result = orig(ui, repo, source, **pycompat.strkwargs(opts)) + # TODO(stash): race condition is possible + # if scratch bookmarks was updated right after orig. + # But that's unlikely and shouldn't be harmful. 
+ if common.isremotebooksenabled(ui): + remotescratchbookmarks.update(scratchbookmarks) + _saveremotebookmarks(repo, remotescratchbookmarks, source) + else: + _savelocalbookmarks(repo, scratchbookmarks) + return result + finally: + if scratchbookmarks: + extensions.unwrapfunction(discovery, 'findcommonincoming') + +def _readscratchremotebookmarks(ui, repo, other): + if common.isremotebooksenabled(ui): + remotenamesext = extensions.find('remotenames') + remotepath = remotenamesext.activepath(repo.ui, other) + result = {} + # Let's refresh remotenames to make sure we have it up to date + # Seems that `repo.names['remotebookmarks']` may return stale bookmarks + # and it results in deleting scratch bookmarks. Our best guess how to + # fix it is to use `clearnames()` + repo._remotenames.clearnames() + for remotebookmark in repo.names['remotebookmarks'].listnames(repo): + path, bookname = remotenamesext.splitremotename(remotebookmark) + if path == remotepath and _scratchbranchmatcher(bookname): + nodes = repo.names['remotebookmarks'].nodes(repo, + remotebookmark) + if nodes: + result[bookname] = hex(nodes[0]) + return result + else: + return {} + +def _saveremotebookmarks(repo, newbookmarks, remote): + remotenamesext = extensions.find('remotenames') + remotepath = remotenamesext.activepath(repo.ui, remote) + branches = collections.defaultdict(list) + bookmarks = {} + remotenames = remotenamesext.readremotenames(repo) + for hexnode, nametype, remote, rname in remotenames: + if remote != remotepath: + continue + if nametype == 'bookmarks': + if rname in newbookmarks: + # It's possible if we have a normal bookmark that matches + # scratch branch pattern. 
In this case just use the current + # bookmark node + del newbookmarks[rname] + bookmarks[rname] = hexnode + elif nametype == 'branches': + # saveremotenames expects 20 byte binary nodes for branches + branches[rname].append(bin(hexnode)) + + for bookmark, hexnode in newbookmarks.iteritems(): + bookmarks[bookmark] = hexnode + remotenamesext.saveremotenames(repo, remotepath, branches, bookmarks) + +def _savelocalbookmarks(repo, bookmarks): + if not bookmarks: + return + with repo.wlock(), repo.lock(), repo.transaction('bookmark') as tr: + changes = [] + for scratchbook, node in bookmarks.iteritems(): + changectx = repo[node] + changes.append((scratchbook, changectx.node())) + repo._bookmarks.applychanges(repo, tr, changes) + +def _findcommonincoming(orig, *args, **kwargs): + common, inc, remoteheads = orig(*args, **kwargs) + return common, True, remoteheads + +def _push(orig, ui, repo, dest=None, *args, **opts): + + bookmark = opts.get(r'bookmark') + # we only support pushing one infinitepush bookmark at once + if len(bookmark) == 1: + bookmark = bookmark[0] + else: + bookmark = '' + + oldphasemove = None + overrides = {(experimental, configbookmark): bookmark} + + with ui.configoverride(overrides, 'infinitepush'): + scratchpush = opts.get('bundle_store') + if _scratchbranchmatcher(bookmark): + scratchpush = True + # bundle2 can be sent back after push (for example, bundle2 + # containing `pushkey` part to update bookmarks) + ui.setconfig(experimental, 'bundle2.pushback', True) + + if scratchpush: + # this is an infinitepush, we don't want the bookmark to be applied + # rather that should be stored in the bundlestore + opts[r'bookmark'] = [] + ui.setconfig(experimental, configscratchpush, True) + oldphasemove = extensions.wrapfunction(exchange, + '_localphasemove', + _phasemove) + # Copy-paste from `push` command + path = ui.paths.getpath(dest, default=('default-push', 'default')) + if not path: + raise error.Abort(_('default repository not configured!'), + 
hint=_("see 'hg help config.paths'")) + destpath = path.pushloc or path.loc + # Remote scratch bookmarks will be deleted because remotenames doesn't + # know about them. Let's save it before push and restore after + remotescratchbookmarks = _readscratchremotebookmarks(ui, repo, destpath) + result = orig(ui, repo, dest, *args, **opts) + if common.isremotebooksenabled(ui): + if bookmark and scratchpush: + other = hg.peer(repo, opts, destpath) + fetchedbookmarks = other.listkeyspatterns('bookmarks', + patterns=[bookmark]) + remotescratchbookmarks.update(fetchedbookmarks) + _saveremotebookmarks(repo, remotescratchbookmarks, destpath) + if oldphasemove: + exchange._localphasemove = oldphasemove + return result + +def _deleteinfinitepushbookmarks(ui, repo, path, names): + """Prune remote names by removing the bookmarks we don't want anymore, + then writing the result back to disk + """ + remotenamesext = extensions.find('remotenames') + + # remotename format is: + # (node, nametype ("branches" or "bookmarks"), remote, name) + nametype_idx = 1 + remote_idx = 2 + name_idx = 3 + remotenames = [remotename for remotename in \ + remotenamesext.readremotenames(repo) \ + if remotename[remote_idx] == path] + remote_bm_names = [remotename[name_idx] for remotename in \ + remotenames if remotename[nametype_idx] == "bookmarks"] + + for name in names: + if name not in remote_bm_names: + raise error.Abort(_("infinitepush bookmark '{}' does not exist " + "in path '{}'").format(name, path)) + + bookmarks = {} + branches = collections.defaultdict(list) + for node, nametype, remote, name in remotenames: + if nametype == "bookmarks" and name not in names: + bookmarks[name] = node + elif nametype == "branches": + # saveremotenames wants binary nodes for branches + branches[name].append(bin(node)) + + remotenamesext.saveremotenames(repo, path, branches, bookmarks) + +def _phasemove(orig, pushop, nodes, phase=phases.public): + """prevent commits from being marked public + + Since these are 
going to a scratch branch, they aren't really being + published.""" + + if phase != phases.public: + orig(pushop, nodes, phase) + +@exchange.b2partsgenerator(scratchbranchparttype) +def partgen(pushop, bundler): + bookmark = pushop.ui.config(experimental, configbookmark) + scratchpush = pushop.ui.configbool(experimental, configscratchpush) + if 'changesets' in pushop.stepsdone or not scratchpush: + return + + if scratchbranchparttype not in bundle2.bundle2caps(pushop.remote): + return + + pushop.stepsdone.add('changesets') + if not pushop.outgoing.missing: + pushop.ui.status(_('no changes found\n')) + pushop.cgresult = 0 + return + + # This parameter tells the server that the following bundle is an + # infinitepush. This let's it switch the part processing to our infinitepush + # code path. + bundler.addparam("infinitepush", "True") + + scratchparts = bundleparts.getscratchbranchparts(pushop.repo, + pushop.remote, + pushop.outgoing, + pushop.ui, + bookmark) + + for scratchpart in scratchparts: + bundler.addpart(scratchpart) + + def handlereply(op): + # server either succeeds or aborts; no code to read + pushop.cgresult = 1 + + return handlereply + +bundle2.capabilities[bundleparts.scratchbranchparttype] = () + +def _getrevs(bundle, oldnode, force, bookmark): + 'extracts and validates the revs to be imported' + revs = [bundle[r] for r in bundle.revs('sort(bundle())')] + + # new bookmark + if oldnode is None: + return revs + + # Fast forward update + if oldnode in bundle and list(bundle.set('bundle() & %s::', oldnode)): + return revs + + return revs + +@contextlib.contextmanager +def logservicecall(logger, service, **kwargs): + start = time.time() + logger(service, eventtype='start', **kwargs) + try: + yield + logger(service, eventtype='success', + elapsedms=(time.time() - start) * 1000, **kwargs) + except Exception as e: + logger(service, eventtype='failure', + elapsedms=(time.time() - start) * 1000, errormsg=str(e), + **kwargs) + raise + +def 
_getorcreateinfinitepushlogger(op): + logger = op.records['infinitepushlogger'] + if not logger: + ui = op.repo.ui + try: + username = procutil.getuser() + except Exception: + username = 'unknown' + # Generate random request id to be able to find all logged entries + # for the same request. Since requestid is pseudo-generated it may + # not be unique, but we assume that (hostname, username, requestid) + # is unique. + random.seed() + requestid = random.randint(0, 2000000000) + hostname = socket.gethostname() + logger = functools.partial(ui.log, 'infinitepush', user=username, + requestid=requestid, hostname=hostname, + reponame=ui.config('infinitepush', + 'reponame')) + op.records.add('infinitepushlogger', logger) + else: + logger = logger[0] + return logger + +def storetobundlestore(orig, repo, op, unbundler): + """stores the incoming bundle coming from push command to the bundlestore + instead of applying on the revlogs""" + + repo.ui.status(_("storing changesets on the bundlestore\n")) + bundler = bundle2.bundle20(repo.ui) + + # processing each part and storing it in bundler + with bundle2.partiterator(repo, op, unbundler) as parts: + for part in parts: + bundlepart = None + if part.type == 'replycaps': + # This configures the current operation to allow reply parts. 
+ bundle2._processpart(op, part) + else: + bundlepart = bundle2.bundlepart(part.type, data=part.read()) + for key, value in part.params.iteritems(): + bundlepart.addparam(key, value) + + # Certain parts require a response + if part.type in ('pushkey', 'changegroup'): + if op.reply is not None: + rpart = op.reply.newpart('reply:%s' % part.type) + rpart.addparam('in-reply-to', str(part.id), + mandatory=False) + rpart.addparam('return', '1', mandatory=False) + + op.records.add(part.type, { + 'return': 1, + }) + if bundlepart: + bundler.addpart(bundlepart) + + # storing the bundle in the bundlestore + buf = util.chunkbuffer(bundler.getchunks()) + fd, bundlefile = tempfile.mkstemp() + try: + try: + fp = os.fdopen(fd, r'wb') + fp.write(buf.read()) + finally: + fp.close() + storebundle(op, {}, bundlefile) + finally: + try: + os.unlink(bundlefile) + except Exception: + # we would rather see the original exception + pass + +def processparts(orig, repo, op, unbundler): + + # make sure we don't wrap processparts in case of `hg unbundle` + if op.source == 'unbundle': + return orig(repo, op, unbundler) + + # this server routes each push to bundle store + if repo.ui.configbool('infinitepush', 'pushtobundlestore'): + return storetobundlestore(orig, repo, op, unbundler) + + if unbundler.params.get('infinitepush') != 'True': + return orig(repo, op, unbundler) + + handleallparts = repo.ui.configbool('infinitepush', 'storeallparts') + + bundler = bundle2.bundle20(repo.ui) + cgparams = None + with bundle2.partiterator(repo, op, unbundler) as parts: + for part in parts: + bundlepart = None + if part.type == 'replycaps': + # This configures the current operation to allow reply parts. + bundle2._processpart(op, part) + elif part.type == bundleparts.scratchbranchparttype: + # Scratch branch parts need to be converted to normal + # changegroup parts, and the extra parameters stored for later + # when we upload to the store. 
Eventually those parameters will + # be put on the actual bundle instead of this part, then we can + # send a vanilla changegroup instead of the scratchbranch part. + cgversion = part.params.get('cgversion', '01') + bundlepart = bundle2.bundlepart('changegroup', data=part.read()) + bundlepart.addparam('version', cgversion) + cgparams = part.params + + # If we're not dumping all parts into the new bundle, we need to + # alert the future pushkey and phase-heads handler to skip + # the part. + if not handleallparts: + op.records.add(scratchbranchparttype + '_skippushkey', True) + op.records.add(scratchbranchparttype + '_skipphaseheads', + True) + else: + if handleallparts: + # Ideally we would not process any parts, and instead just + # forward them to the bundle for storage, but since this + # differs from previous behavior, we need to put it behind a + # config flag for incremental rollout. + bundlepart = bundle2.bundlepart(part.type, data=part.read()) + for key, value in part.params.iteritems(): + bundlepart.addparam(key, value) + + # Certain parts require a response + if part.type == 'pushkey': + if op.reply is not None: + rpart = op.reply.newpart('reply:pushkey') + rpart.addparam('in-reply-to', str(part.id), + mandatory=False) + rpart.addparam('return', '1', mandatory=False) + else: + bundle2._processpart(op, part) + + if handleallparts: + op.records.add(part.type, { + 'return': 1, + }) + if bundlepart: + bundler.addpart(bundlepart) + + # If commits were sent, store them + if cgparams: + buf = util.chunkbuffer(bundler.getchunks()) + fd, bundlefile = tempfile.mkstemp() + try: + try: + fp = os.fdopen(fd, r'wb') + fp.write(buf.read()) + finally: + fp.close() + storebundle(op, cgparams, bundlefile) + finally: + try: + os.unlink(bundlefile) + except Exception: + # we would rather see the original exception + pass + +def storebundle(op, params, bundlefile): + log = _getorcreateinfinitepushlogger(op) + parthandlerstart = time.time() + log(scratchbranchparttype, 
eventtype='start') + index = op.repo.bundlestore.index + store = op.repo.bundlestore.store + op.records.add(scratchbranchparttype + '_skippushkey', True) + + bundle = None + try: # guards bundle + bundlepath = "bundle:%s+%s" % (op.repo.root, bundlefile) + bundle = hg.repository(op.repo.ui, bundlepath) + + bookmark = params.get('bookmark') + bookprevnode = params.get('bookprevnode', '') + force = params.get('force') + + if bookmark: + oldnode = index.getnode(bookmark) + else: + oldnode = None + bundleheads = bundle.revs('heads(bundle())') + if bookmark and len(bundleheads) > 1: + raise error.Abort( + _('cannot push more than one head to a scratch branch')) + + revs = _getrevs(bundle, oldnode, force, bookmark) + + # Notify the user of what is being pushed + plural = 's' if len(revs) > 1 else '' + op.repo.ui.warn(_("pushing %d commit%s:\n") % (len(revs), plural)) + maxoutput = 10 + for i in range(0, min(len(revs), maxoutput)): + firstline = bundle[revs[i]].description().split('\n')[0][:50] + op.repo.ui.warn((" %s %s\n") % (revs[i], firstline)) + + if len(revs) > maxoutput + 1: + op.repo.ui.warn((" ...\n")) + firstline = bundle[revs[-1]].description().split('\n')[0][:50] + op.repo.ui.warn((" %s %s\n") % (revs[-1], firstline)) + + nodesctx = [bundle[rev] for rev in revs] + inindex = lambda rev: bool(index.getbundle(bundle[rev].hex())) + if bundleheads: + newheadscount = sum(not inindex(rev) for rev in bundleheads) + else: + newheadscount = 0 + # If there's a bookmark specified, there should be only one head, + # so we choose the last node, which will be that head. + # If a bug or malicious client allows there to be a bookmark + # with multiple heads, we will place the bookmark on the last head. 
+ bookmarknode = nodesctx[-1].hex() if nodesctx else None + key = None + if newheadscount: + with open(bundlefile, 'r') as f: + bundledata = f.read() + with logservicecall(log, 'bundlestore', + bundlesize=len(bundledata)): + bundlesizelimit = 100 * 1024 * 1024 # 100 MB + if len(bundledata) > bundlesizelimit: + error_msg = ('bundle is too big: %d bytes. ' + + 'max allowed size is 100 MB') + raise error.Abort(error_msg % (len(bundledata),)) + key = store.write(bundledata) + + with logservicecall(log, 'index', newheadscount=newheadscount), index: + if key: + index.addbundle(key, nodesctx) + if bookmark: + index.addbookmark(bookmark, bookmarknode) + _maybeaddpushbackpart(op, bookmark, bookmarknode, + bookprevnode, params) + log(scratchbranchparttype, eventtype='success', + elapsedms=(time.time() - parthandlerstart) * 1000) + + except Exception as e: + log(scratchbranchparttype, eventtype='failure', + elapsedms=(time.time() - parthandlerstart) * 1000, + errormsg=str(e)) + raise + finally: + if bundle: + bundle.close() + +@bundle2.parthandler(scratchbranchparttype, + ('bookmark', 'bookprevnode', 'force', + 'pushbackbookmarks', 'cgversion')) +def bundle2scratchbranch(op, part): + '''unbundle a bundle2 part containing a changegroup to store''' + + bundler = bundle2.bundle20(op.repo.ui) + cgversion = part.params.get('cgversion', '01') + cgpart = bundle2.bundlepart('changegroup', data=part.read()) + cgpart.addparam('version', cgversion) + bundler.addpart(cgpart) + buf = util.chunkbuffer(bundler.getchunks()) + + fd, bundlefile = tempfile.mkstemp() + try: + try: + fp = os.fdopen(fd, r'wb') + fp.write(buf.read()) + finally: + fp.close() + storebundle(op, part.params, bundlefile) + finally: + try: + os.unlink(bundlefile) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + return 1 + +def _maybeaddpushbackpart(op, bookmark, newnode, oldnode, params): + if params.get('pushbackbookmarks'): + if op.reply and 'pushback' in op.reply.capabilities: + params = { + 
'namespace': 'bookmarks', + 'key': bookmark, + 'new': newnode, + 'old': oldnode, + } + op.reply.newpart('pushkey', mandatoryparams=params.iteritems()) + +def bundle2pushkey(orig, op, part): + '''Wrapper of bundle2.handlepushkey() + + The only goal is to skip calling the original function if flag is set. + It's set if infinitepush push is happening. + ''' + if op.records[scratchbranchparttype + '_skippushkey']: + if op.reply is not None: + rpart = op.reply.newpart('reply:pushkey') + rpart.addparam('in-reply-to', str(part.id), mandatory=False) + rpart.addparam('return', '1', mandatory=False) + return 1 + + return orig(op, part) + +def bundle2handlephases(orig, op, part): + '''Wrapper of bundle2.handlephases() + + The only goal is to skip calling the original function if flag is set. + It's set if infinitepush push is happening. + ''' + + if op.records[scratchbranchparttype + '_skipphaseheads']: + return + + return orig(op, part) + +def _asyncsavemetadata(root, nodes): + '''starts a separate process that fills metadata for the nodes + + This function creates a separate process and doesn't wait for it's + completion. This was done to avoid slowing down pushes + ''' + + maxnodes = 50 + if len(nodes) > maxnodes: + return + nodesargs = [] + for node in nodes: + nodesargs.append('--node') + nodesargs.append(node) + with open(os.devnull, 'w+b') as devnull: + cmdline = [util.hgexecutable(), 'debugfillinfinitepushmetadata', + '-R', root] + nodesargs + # Process will run in background. We don't care about the return code + subprocess.Popen(cmdline, close_fds=True, shell=False, + stdin=devnull, stdout=devnull, stderr=devnull) diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/bundleparts.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/bundleparts.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,115 @@ +# Copyright 2017 Facebook, Inc. 
+# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from mercurial.i18n import _ + +from mercurial import ( + bundle2, + changegroup, + error, + extensions, + revsetlang, + util, +) + +from . import common + +isremotebooksenabled = common.isremotebooksenabled + +scratchbranchparttype = 'b2x:infinitepush' + +def getscratchbranchparts(repo, peer, outgoing, ui, bookmark): + if not outgoing.missing: + raise error.Abort(_('no commits to push')) + + if scratchbranchparttype not in bundle2.bundle2caps(peer): + raise error.Abort(_('no server support for %r') % scratchbranchparttype) + + _validaterevset(repo, revsetlang.formatspec('%ln', outgoing.missing), + bookmark) + + supportedversions = changegroup.supportedoutgoingversions(repo) + # Explicitly avoid using '01' changegroup version in infinitepush to + # support general delta + supportedversions.discard('01') + cgversion = min(supportedversions) + _handlelfs(repo, outgoing.missing) + cg = changegroup.makestream(repo, outgoing, cgversion, 'push') + + params = {} + params['cgversion'] = cgversion + if bookmark: + params['bookmark'] = bookmark + # 'prevbooknode' is necessary for pushkey reply part + params['bookprevnode'] = '' + bookmarks = repo._bookmarks + if bookmark in bookmarks: + params['bookprevnode'] = bookmarks.changectx(bookmark).hex() + + # Do not send pushback bundle2 part with bookmarks if remotenames extension + # is enabled. 
It will be handled manually in `_push()` + if not isremotebooksenabled(ui): + params['pushbackbookmarks'] = '1' + + parts = [] + + # .upper() marks this as a mandatory part: server will abort if there's no + # handler + parts.append(bundle2.bundlepart( + scratchbranchparttype.upper(), + advisoryparams=params.iteritems(), + data=cg)) + + return parts + +def _validaterevset(repo, revset, bookmark): + """Abort if the revs to be pushed aren't valid for a scratch branch.""" + if not repo.revs(revset): + raise error.Abort(_('nothing to push')) + if bookmark: + # Allow bundle with many heads only if no bookmark is specified + heads = repo.revs('heads(%r)', revset) + if len(heads) > 1: + raise error.Abort( + _('cannot push more than one head to a scratch branch')) + +def _handlelfs(repo, missing): + '''Special case if lfs is enabled + + If lfs is enabled then we need to call prepush hook + to make sure large files are uploaded to lfs + ''' + try: + lfsmod = extensions.find('lfs') + lfsmod.wrapper.uploadblobsfromrevs(repo, missing) + except KeyError: + # Ignore if lfs extension is not enabled + return + +class copiedpart(object): + """a copy of unbundlepart content that can be consumed later""" + + def __init__(self, part): + # copy "public properties" + self.type = part.type + self.id = part.id + self.mandatory = part.mandatory + self.mandatoryparams = part.mandatoryparams + self.advisoryparams = part.advisoryparams + self.params = part.params + self.mandatorykeys = part.mandatorykeys + # copy the buffer + self._io = util.stringio(part.read()) + + def consume(self): + return + + def read(self, size=None): + if size is None: + return self._io.read() + else: + return self._io.read(size) diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/common.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/common.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,48 @@ +# Copyright 2017 Facebook, Inc. 
+# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import os +import tempfile + +from mercurial.node import hex + +from mercurial import ( + error, + extensions, +) + +def isremotebooksenabled(ui): + return ('remotenames' in extensions._extensions and + ui.configbool('remotenames', 'bookmarks')) + +def downloadbundle(repo, unknownbinhead): + index = repo.bundlestore.index + store = repo.bundlestore.store + bundleid = index.getbundle(hex(unknownbinhead)) + if bundleid is None: + raise error.Abort('%s head is not known' % hex(unknownbinhead)) + bundleraw = store.read(bundleid) + return _makebundlefromraw(bundleraw) + +def _makebundlefromraw(data): + fp = None + fd, bundlefile = tempfile.mkstemp() + try: # guards bundlefile + try: # guards fp + fp = os.fdopen(fd, 'wb') + fp.write(data) + finally: + fp.close() + except Exception: + try: + os.unlink(bundlefile) + except Exception: + # we would rather see the original exception + pass + raise + + return bundlefile diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/fileindexapi.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/fileindexapi.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,107 @@ +# Infinite push +# +# Copyright 2016 Facebook, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. +""" + [infinitepush] + # Server-side option. Used only if indextype=disk. + # Filesystem path to the index store + indexpath = PATH +""" + +from __future__ import absolute_import + +import os + +from mercurial.utils import stringutil + +from . 
import indexapi + +class fileindexapi(indexapi.indexapi): + def __init__(self, repo): + super(fileindexapi, self).__init__() + self._repo = repo + root = repo.ui.config('infinitepush', 'indexpath') + if not root: + root = os.path.join('scratchbranches', 'index') + + self._nodemap = os.path.join(root, 'nodemap') + self._bookmarkmap = os.path.join(root, 'bookmarkmap') + self._metadatamap = os.path.join(root, 'nodemetadatamap') + self._lock = None + + def __enter__(self): + self._lock = self._repo.wlock() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._lock: + self._lock.__exit__(exc_type, exc_val, exc_tb) + + def addbundle(self, bundleid, nodesctx): + for node in nodesctx: + nodepath = os.path.join(self._nodemap, node.hex()) + self._write(nodepath, bundleid) + + def addbookmark(self, bookmark, node): + bookmarkpath = os.path.join(self._bookmarkmap, bookmark) + self._write(bookmarkpath, node) + + def addmanybookmarks(self, bookmarks): + for bookmark, node in bookmarks.items(): + self.addbookmark(bookmark, node) + + def deletebookmarks(self, patterns): + for pattern in patterns: + for bookmark, _ in self._listbookmarks(pattern): + bookmarkpath = os.path.join(self._bookmarkmap, bookmark) + self._delete(bookmarkpath) + + def getbundle(self, node): + nodepath = os.path.join(self._nodemap, node) + return self._read(nodepath) + + def getnode(self, bookmark): + bookmarkpath = os.path.join(self._bookmarkmap, bookmark) + return self._read(bookmarkpath) + + def getbookmarks(self, query): + return dict(self._listbookmarks(query)) + + def saveoptionaljsonmetadata(self, node, jsonmetadata): + vfs = self._repo.vfs + vfs.write(os.path.join(self._metadatamap, node), jsonmetadata) + + def _listbookmarks(self, pattern): + if pattern.endswith('*'): + pattern = 're:^' + pattern[:-1] + '.*' + kind, pat, matcher = stringutil.stringmatcher(pattern) + prefixlen = len(self._bookmarkmap) + 1 + for dirpath, _, books in self._repo.vfs.walk(self._bookmarkmap): + for 
book in books: + bookmark = os.path.join(dirpath, book)[prefixlen:] + if not matcher(bookmark): + continue + yield bookmark, self._read(os.path.join(dirpath, book)) + + def _write(self, path, value): + vfs = self._repo.vfs + dirname = vfs.dirname(path) + if not vfs.exists(dirname): + vfs.makedirs(dirname) + + vfs.write(path, value) + + def _read(self, path): + vfs = self._repo.vfs + if not vfs.exists(path): + return None + return vfs.read(path) + + def _delete(self, path): + vfs = self._repo.vfs + if not vfs.exists(path): + return + return vfs.unlink(path) diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/indexapi.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/indexapi.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,70 @@ +# Infinite push +# +# Copyright 2016 Facebook, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +class indexapi(object): + """Class that manages access to infinitepush index. + + This class is a context manager and all write operations (like + deletebookmarks, addbookmark etc) should use `with` statement: + + with index: + index.deletebookmarks(...) + ... 
+ """ + + def __init__(self): + """Initializes the metadata store connection.""" + + def close(self): + """Cleans up the metadata store connection.""" + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + def addbundle(self, bundleid, nodesctx): + """Takes a bundleid and a list of node contexts for each node + in that bundle and records that.""" + raise NotImplementedError() + + def addbookmark(self, bookmark, node): + """Takes a bookmark name and hash, and records mapping in the metadata + store.""" + raise NotImplementedError() + + def addmanybookmarks(self, bookmarks): + """Takes a dict with mapping from bookmark to hash and records mapping + in the metadata store.""" + raise NotImplementedError() + + def deletebookmarks(self, patterns): + """Accepts list of bookmarks and deletes them. + """ + raise NotImplementedError() + + def getbundle(self, node): + """Returns the bundleid for the bundle that contains the given node.""" + raise NotImplementedError() + + def getnode(self, bookmark): + """Returns the node for the given bookmark. 
None if it doesn't exist.""" + raise NotImplementedError() + + def getbookmarks(self, query): + """Returns bookmarks that match the query""" + raise NotImplementedError() + + def saveoptionaljsonmetadata(self, node, jsonmetadata): + """Saves optional metadata for a given node""" + raise NotImplementedError() + +class indexexception(Exception): + pass diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/schema.sql --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/schema.sql Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,33 @@ +CREATE TABLE `bookmarkstonode` ( + `node` varbinary(64) NOT NULL, + `bookmark` varbinary(512) NOT NULL, + `reponame` varbinary(255) NOT NULL, + PRIMARY KEY (`reponame`,`bookmark`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +CREATE TABLE `bundles` ( + `bundle` varbinary(512) NOT NULL, + `reponame` varbinary(255) NOT NULL, + PRIMARY KEY (`bundle`,`reponame`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +CREATE TABLE `nodestobundle` ( + `node` varbinary(64) NOT NULL, + `bundle` varbinary(512) NOT NULL, + `reponame` varbinary(255) NOT NULL, + PRIMARY KEY (`node`,`reponame`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +CREATE TABLE `nodesmetadata` ( + `node` varbinary(64) NOT NULL, + `message` mediumblob NOT NULL, + `p1` varbinary(64) NOT NULL, + `p2` varbinary(64) DEFAULT NULL, + `author` varbinary(255) NOT NULL, + `committer` varbinary(255) DEFAULT NULL, + `author_date` bigint(20) NOT NULL, + `committer_date` bigint(20) DEFAULT NULL, + `reponame` varbinary(255) NOT NULL, + `optional_json_metadata` mediumblob, + PRIMARY KEY (`reponame`,`node`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/sqlindexapi.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/infinitepush/sqlindexapi.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,256 @@ +# Infinite push +# +# Copyright 2016 Facebook, Inc. 
+# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import logging +import os +import time + +import warnings +import mysql.connector + +from . import indexapi + +def _convertbookmarkpattern(pattern): + pattern = pattern.replace('_', '\\_') + pattern = pattern.replace('%', '\\%') + if pattern.endswith('*'): + pattern = pattern[:-1] + '%' + return pattern + +class sqlindexapi(indexapi.indexapi): + ''' + Sql backend for infinitepush index. See schema.sql + ''' + + def __init__(self, reponame, host, port, + database, user, password, logfile, loglevel, + waittimeout=300, locktimeout=120): + super(sqlindexapi, self).__init__() + self.reponame = reponame + self.sqlargs = { + 'host': host, + 'port': port, + 'database': database, + 'user': user, + 'password': password, + } + self.sqlconn = None + self.sqlcursor = None + if not logfile: + logfile = os.devnull + logging.basicConfig(filename=logfile) + self.log = logging.getLogger() + self.log.setLevel(loglevel) + self._connected = False + self._waittimeout = waittimeout + self._locktimeout = locktimeout + + def sqlconnect(self): + if self.sqlconn: + raise indexapi.indexexception("SQL connection already open") + if self.sqlcursor: + raise indexapi.indexexception("SQL cursor already open without" + " connection") + retry = 3 + while True: + try: + self.sqlconn = mysql.connector.connect(**self.sqlargs) + + # Code is copy-pasted from hgsql. Bug fixes need to be + # back-ported! + # The default behavior is to return byte arrays, when we + # need strings. This custom convert returns strings. + self.sqlconn.set_converter_class(CustomConverter) + self.sqlconn.autocommit = False + break + except mysql.connector.errors.Error: + # mysql can be flakey occasionally, so do some minimal + # retrying. 
+ retry -= 1 + if retry == 0: + raise + time.sleep(0.2) + + waittimeout = self.sqlconn.converter.escape('%s' % self._waittimeout) + + self.sqlcursor = self.sqlconn.cursor() + self.sqlcursor.execute("SET wait_timeout=%s" % waittimeout) + self.sqlcursor.execute("SET innodb_lock_wait_timeout=%s" % + self._locktimeout) + self._connected = True + + def close(self): + """Cleans up the metadata store connection.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.sqlcursor.close() + self.sqlconn.close() + self.sqlcursor = None + self.sqlconn = None + + def __enter__(self): + if not self._connected: + self.sqlconnect() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is None: + self.sqlconn.commit() + else: + self.sqlconn.rollback() + + def addbundle(self, bundleid, nodesctx): + if not self._connected: + self.sqlconnect() + self.log.info("ADD BUNDLE %r %r" % (self.reponame, bundleid)) + self.sqlcursor.execute( + "INSERT INTO bundles(bundle, reponame) VALUES " + "(%s, %s)", params=(bundleid, self.reponame)) + for ctx in nodesctx: + self.sqlcursor.execute( + "INSERT INTO nodestobundle(node, bundle, reponame) " + "VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE " + "bundle=VALUES(bundle)", + params=(ctx.hex(), bundleid, self.reponame)) + + extra = ctx.extra() + author_name = ctx.user() + committer_name = extra.get('committer', ctx.user()) + author_date = int(ctx.date()[0]) + committer_date = int(extra.get('committer_date', author_date)) + self.sqlcursor.execute( + "INSERT IGNORE INTO nodesmetadata(node, message, p1, p2, " + "author, committer, author_date, committer_date, " + "reponame) VALUES " + "(%s, %s, %s, %s, %s, %s, %s, %s, %s)", + params=(ctx.hex(), ctx.description(), + ctx.p1().hex(), ctx.p2().hex(), author_name, + committer_name, author_date, committer_date, + self.reponame) + ) + + def addbookmark(self, bookmark, node): + """Takes a bookmark name and hash, and records mapping in the metadata + store.""" + if not 
self._connected: + self.sqlconnect() + self.log.info( + "ADD BOOKMARKS %r bookmark: %r node: %r" % + (self.reponame, bookmark, node)) + self.sqlcursor.execute( + "INSERT INTO bookmarkstonode(bookmark, node, reponame) " + "VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE node=VALUES(node)", + params=(bookmark, node, self.reponame)) + + def addmanybookmarks(self, bookmarks): + if not self._connected: + self.sqlconnect() + args = [] + values = [] + for bookmark, node in bookmarks.iteritems(): + args.append('(%s, %s, %s)') + values.extend((bookmark, node, self.reponame)) + args = ','.join(args) + + self.sqlcursor.execute( + "INSERT INTO bookmarkstonode(bookmark, node, reponame) " + "VALUES %s ON DUPLICATE KEY UPDATE node=VALUES(node)" % args, + params=values) + + def deletebookmarks(self, patterns): + """Accepts list of bookmark patterns and deletes them. + If `commit` is set then bookmark will actually be deleted. Otherwise + deletion will be delayed until the end of transaction. + """ + if not self._connected: + self.sqlconnect() + self.log.info("DELETE BOOKMARKS: %s" % patterns) + for pattern in patterns: + pattern = _convertbookmarkpattern(pattern) + self.sqlcursor.execute( + "DELETE from bookmarkstonode WHERE bookmark LIKE (%s) " + "and reponame = %s", + params=(pattern, self.reponame)) + + def getbundle(self, node): + """Returns the bundleid for the bundle that contains the given node.""" + if not self._connected: + self.sqlconnect() + self.log.info("GET BUNDLE %r %r" % (self.reponame, node)) + self.sqlcursor.execute( + "SELECT bundle from nodestobundle " + "WHERE node = %s AND reponame = %s", params=(node, self.reponame)) + result = self.sqlcursor.fetchall() + if len(result) != 1 or len(result[0]) != 1: + self.log.info("No matching node") + return None + bundle = result[0][0] + self.log.info("Found bundle %r" % bundle) + return bundle + + def getnode(self, bookmark): + """Returns the node for the given bookmark. 
None if it doesn't exist.""" + if not self._connected: + self.sqlconnect() + self.log.info( + "GET NODE reponame: %r bookmark: %r" % (self.reponame, bookmark)) + self.sqlcursor.execute( + "SELECT node from bookmarkstonode WHERE " + "bookmark = %s AND reponame = %s", params=(bookmark, self.reponame)) + result = self.sqlcursor.fetchall() + if len(result) != 1 or len(result[0]) != 1: + self.log.info("No matching bookmark") + return None + node = result[0][0] + self.log.info("Found node %r" % node) + return node + + def getbookmarks(self, query): + if not self._connected: + self.sqlconnect() + self.log.info( + "QUERY BOOKMARKS reponame: %r query: %r" % (self.reponame, query)) + query = _convertbookmarkpattern(query) + self.sqlcursor.execute( + "SELECT bookmark, node from bookmarkstonode WHERE " + "reponame = %s AND bookmark LIKE %s", + params=(self.reponame, query)) + result = self.sqlcursor.fetchall() + bookmarks = {} + for row in result: + if len(row) != 2: + self.log.info("Bad row returned: %s" % row) + continue + bookmarks[row[0]] = row[1] + return bookmarks + + def saveoptionaljsonmetadata(self, node, jsonmetadata): + if not self._connected: + self.sqlconnect() + self.log.info( + ("INSERT METADATA, QUERY BOOKMARKS reponame: %r " + + "node: %r, jsonmetadata: %s") % + (self.reponame, node, jsonmetadata)) + + self.sqlcursor.execute( + "UPDATE nodesmetadata SET optional_json_metadata=%s WHERE " + "reponame=%s AND node=%s", + params=(jsonmetadata, self.reponame, node)) + +class CustomConverter(mysql.connector.conversion.MySQLConverter): + """Ensure that all values being returned are returned as python string + (versus the default byte arrays).""" + def _STRING_to_python(self, value, dsc=None): + return str(value) + + def _VAR_STRING_to_python(self, value, dsc=None): + return str(value) + + def _BLOB_to_python(self, value, dsc=None): + return str(value) diff -r fb92df8b634c -r ed5448edcbfa hgext/infinitepush/store.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/hgext/infinitepush/store.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,155 @@ +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +# based on bundleheads extension by Gregory Szorc + +from __future__ import absolute_import + +import abc +import hashlib +import os +import subprocess +import tempfile + +NamedTemporaryFile = tempfile.NamedTemporaryFile + +class BundleWriteException(Exception): + pass + +class BundleReadException(Exception): + pass + +class abstractbundlestore(object): + """Defines the interface for bundle stores. + + A bundle store is an entity that stores raw bundle data. It is a simple + key-value store. However, the keys are chosen by the store. The keys can + be any Python object understood by the corresponding bundle index (see + ``abstractbundleindex`` below). + """ + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def write(self, data): + """Write bundle data to the store. + + This function receives the raw data to be written as a str. + Throws BundleWriteException + The key of the written data MUST be returned. + """ + + @abc.abstractmethod + def read(self, key): + """Obtain bundle data for a key. + + Returns None if the bundle isn't known. + Throws BundleReadException + The returned object should be a file object supporting read() + and close(). 
+ """ + +class filebundlestore(object): + """bundle store in filesystem + + meant for storing bundles somewhere on disk and on network filesystems + """ + def __init__(self, ui, repo): + self.ui = ui + self.repo = repo + self.storepath = ui.configpath('scratchbranch', 'storepath') + if not self.storepath: + self.storepath = self.repo.vfs.join("scratchbranches", + "filebundlestore") + if not os.path.exists(self.storepath): + os.makedirs(self.storepath) + + def _dirpath(self, hashvalue): + """First two bytes of the hash are the name of the upper + level directory, next two bytes are the name of the + next level directory""" + return os.path.join(self.storepath, hashvalue[0:2], hashvalue[2:4]) + + def _filepath(self, filename): + return os.path.join(self._dirpath(filename), filename) + + def write(self, data): + filename = hashlib.sha1(data).hexdigest() + dirpath = self._dirpath(filename) + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + with open(self._filepath(filename), 'w') as f: + f.write(data) + + return filename + + def read(self, key): + try: + f = open(self._filepath(key), 'r') + except IOError: + return None + + return f.read() + +class externalbundlestore(abstractbundlestore): + def __init__(self, put_binary, put_args, get_binary, get_args): + """ + `put_binary` - path to binary file which uploads bundle to external + storage and prints key to stdout + `put_args` - format string with additional args to `put_binary` + {filename} replacement field can be used. + `get_binary` - path to binary file which accepts filename and key + (in that order), downloads bundle from store and saves it to file + `get_args` - format string with additional args to `get_binary`. + {filename} and {handle} replacement field can be used. 
+ """ + + self.put_args = put_args + self.get_args = get_args + self.put_binary = put_binary + self.get_binary = get_binary + + def _call_binary(self, args): + p = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + close_fds=True) + stdout, stderr = p.communicate() + returncode = p.returncode + return returncode, stdout, stderr + + def write(self, data): + # Won't work on windows because you can't open file second time without + # closing it + with NamedTemporaryFile() as temp: + temp.write(data) + temp.flush() + temp.seek(0) + formatted_args = [arg.format(filename=temp.name) + for arg in self.put_args] + returncode, stdout, stderr = self._call_binary( + [self.put_binary] + formatted_args) + + if returncode != 0: + raise BundleWriteException( + 'Failed to upload to external store: %s' % stderr) + stdout_lines = stdout.splitlines() + if len(stdout_lines) == 1: + return stdout_lines[0] + else: + raise BundleWriteException( + 'Bad output from %s: %s' % (self.put_binary, stdout)) + + def read(self, handle): + # Won't work on windows because you can't open file second time without + # closing it + with NamedTemporaryFile() as temp: + formatted_args = [arg.format(filename=temp.name, handle=handle) + for arg in self.get_args] + returncode, stdout, stderr = self._call_binary( + [self.get_binary] + formatted_args) + + if returncode != 0: + raise BundleReadException( + 'Failed to download from external store: %s' % stderr) + return temp.read() diff -r fb92df8b634c -r ed5448edcbfa hgext/journal.py --- a/hgext/journal.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/journal.py Wed Apr 18 15:32:08 2018 -0400 @@ -24,18 +24,23 @@ bookmarks, cmdutil, dispatch, + encoding, error, extensions, hg, localrepo, lock, + logcmdutil, node, pycompat, registrar, util, ) - -from . 
import share +from mercurial.utils import ( + dateutil, + procutil, + stringutil, +) cmdtable = {} command = registrar.command(cmdtable) @@ -168,7 +173,7 @@ """Copy shared journal entries into this repo when unsharing""" if (repo.path == repopath and repo.shared() and util.safehasattr(repo, 'journal')): - sharedrepo = share._getsrcrepo(repo) + sharedrepo = hg.sharedreposource(repo) sharedfeatures = _readsharedfeatures(repo) if sharedrepo and sharedfeatures > {'journal'}: # there is a shared repository and there are shared journal entries @@ -219,15 +224,17 @@ (timestamp, tz), user, command, namespace, name, oldhashes, newhashes) - def __str__(self): - """String representation for storage""" - time = ' '.join(map(str, self.timestamp)) + def __bytes__(self): + """bytes representation for storage""" + time = ' '.join(map(pycompat.bytestr, self.timestamp)) oldhashes = ','.join([node.hex(hash) for hash in self.oldhashes]) newhashes = ','.join([node.hex(hash) for hash in self.newhashes]) return '\n'.join(( time, self.user, self.command, self.namespace, self.name, oldhashes, newhashes)) + __str__ = encoding.strmethod(__bytes__) + class journalstorage(object): """Storage for journal entries @@ -249,7 +256,7 @@ _lockref = None def __init__(self, repo): - self.user = util.getuser() + self.user = procutil.getuser() self.ui = repo.ui self.vfs = repo.vfs @@ -257,7 +264,7 @@ self.sharedfeatures = self.sharedvfs = None if repo.shared(): features = _readsharedfeatures(repo) - sharedrepo = share._getsrcrepo(repo) + sharedrepo = hg.sharedreposource(repo) if sharedrepo is not None and 'journal' in features: self.sharedvfs = sharedrepo.vfs self.sharedfeatures = features @@ -266,7 +273,7 @@ @property def command(self): commandstr = ' '.join( - map(util.shellquote, journalstorage._currentcommand)) + map(procutil.shellquote, journalstorage._currentcommand)) if '\n' in commandstr: # truncate multi-line commands commandstr = commandstr.partition('\n')[0] + ' ...' 
@@ -327,7 +334,7 @@ newhashes = [newhashes] entry = journalentry( - util.makedate(), self.user, self.command, namespace, name, + dateutil.makedate(), self.user, self.command, namespace, name, oldhashes, newhashes) vfs = self.vfs @@ -348,7 +355,7 @@ # Read just enough bytes to get a version number (up to 2 # digits plus separator) version = f.read(3).partition('\0')[0] - if version and version != str(storageversion): + if version and version != "%d" % storageversion: # different version of the storage. Exit early (and not # write anything) if this is not a version we can handle or # the file is corrupt. In future, perhaps rotate the file @@ -358,9 +365,9 @@ return if not version: # empty file, write version first - f.write(str(storageversion) + '\0') + f.write(("%d" % storageversion) + '\0') f.seek(0, os.SEEK_END) - f.write(str(entry) + '\0') + f.write(bytes(entry) + '\0') def filtered(self, namespace=None, name=None): """Yield all journal entries with the given namespace or name @@ -373,9 +380,9 @@ """ if namespace is not None: - namespace = util.stringmatcher(namespace)[-1] + namespace = stringutil.stringmatcher(namespace)[-1] if name is not None: - name = util.stringmatcher(name)[-1] + name = stringutil.stringmatcher(name)[-1] for entry in self: if namespace is not None and not namespace(entry.namespace): continue @@ -410,7 +417,7 @@ lines = raw.split('\0') version = lines and lines[0] - if version != str(storageversion): + if version != "%d" % storageversion: version = version or _('not available') raise error.Abort(_("unknown journal file version '%s'") % version) @@ -478,7 +485,7 @@ displayname = "'%s'" % name ui.status(_("previous locations of %s:\n") % displayname) - limit = cmdutil.loglimit(opts) + limit = logcmdutil.getlimit(opts) entry = None ui.pager('journal') for count, entry in enumerate(repo.journal.filtered(name=name)): @@ -502,13 +509,13 @@ fm.write('command', ' %s\n', entry.command) if opts.get("commits"): - displayer = cmdutil.show_changeset(ui, 
repo, opts, buffered=False) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) for hash in entry.newhashes: try: ctx = repo[hash] displayer.show(ctx) except error.RepoLookupError as e: - fm.write('repolookuperror', "%s\n\n", str(e)) + fm.write('repolookuperror', "%s\n\n", pycompat.bytestr(e)) displayer.close() fm.end() diff -r fb92df8b634c -r ed5448edcbfa hgext/keyword.py --- a/hgext/keyword.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/keyword.py Wed Apr 18 15:32:08 2018 -0400 @@ -101,6 +101,7 @@ extensions, filelog, localrepo, + logcmdutil, match, patch, pathutil, @@ -108,8 +109,13 @@ registrar, scmutil, templatefilters, + templateutil, util, ) +from mercurial.utils import ( + dateutil, + stringutil, +) cmdtable = {} command = registrar.command(cmdtable) @@ -151,25 +157,27 @@ default=False, ) # date like in cvs' $Date -@templatefilter('utcdate') -def utcdate(text): +@templatefilter('utcdate', intype=templateutil.date) +def utcdate(date): '''Date. Returns a UTC-date in this format: "2009/08/18 11:00:13". ''' - return util.datestr((util.parsedate(text)[0], 0), '%Y/%m/%d %H:%M:%S') + dateformat = '%Y/%m/%d %H:%M:%S' + return dateutil.datestr((date[0], 0), dateformat) # date like in svn's $Date -@templatefilter('svnisodate') -def svnisodate(text): +@templatefilter('svnisodate', intype=templateutil.date) +def svnisodate(date): '''Date. Returns a date in this format: "2009-08-18 13:00:13 +0200 (Tue, 18 Aug 2009)". ''' - return util.datestr(text, '%Y-%m-%d %H:%M:%S %1%2 (%a, %d %b %Y)') + return dateutil.datestr(date, '%Y-%m-%d %H:%M:%S %1%2 (%a, %d %b %Y)') # date like in svn's $Id -@templatefilter('svnutcdate') -def svnutcdate(text): +@templatefilter('svnutcdate', intype=templateutil.date) +def svnutcdate(date): '''Date. Returns a UTC-date in this format: "2009-08-18 11:00:13Z". 
''' - return util.datestr((util.parsedate(text)[0], 0), '%Y-%m-%d %H:%M:%SZ') + dateformat = '%Y-%m-%d %H:%M:%SZ' + return dateutil.datestr((date[0], 0), dateformat) # make keyword tools accessible kwtools = {'hgcmd': ''} @@ -254,7 +262,7 @@ '''Replaces keywords in data with expanded template.''' def kwsub(mobj): kw = mobj.group(1) - ct = cmdutil.makelogtemplater(self.ui, self.repo, + ct = logcmdutil.maketemplater(self.ui, self.repo, self.templates[kw]) self.ui.pushbuffer() ct.show(ctx, root=self.repo.root, file=path) @@ -268,7 +276,8 @@ def expand(self, path, node, data): '''Returns data with keywords expanded.''' - if not self.restrict and self.match(path) and not util.binary(data): + if (not self.restrict and self.match(path) + and not stringutil.binary(data)): ctx = self.linkctx(path, node) return self.substitute(data, path, ctx, self.rekw.sub) return data @@ -300,7 +309,7 @@ data = self.repo.file(f).read(mf[f]) else: data = self.repo.wread(f) - if util.binary(data): + if stringutil.binary(data): continue if expand: parents = ctx.parents() @@ -331,7 +340,7 @@ def shrink(self, fname, text): '''Returns text with all keyword substitutions removed.''' - if self.match(fname) and not util.binary(text): + if self.match(fname) and not stringutil.binary(text): return _shrinktext(text, self.rekwexp.sub) return text @@ -339,7 +348,7 @@ '''Returns lines with keyword substitutions removed.''' if self.match(fname): text = ''.join(lines) - if not util.binary(text): + if not stringutil.binary(text): return _shrinktext(text, self.rekwexp.sub).splitlines(True) return lines @@ -610,14 +619,14 @@ if kwt: kwt.restrict = restrict -def kwweb_skip(orig, web, req, tmpl): +def kwweb_skip(orig, web): '''Wraps webcommands.x turning off keyword expansion.''' kwt = getattr(web.repo, '_keywordkwt', None) if kwt: origmatch = kwt.match kwt.match = util.never try: - for chunk in orig(web, req, tmpl): + for chunk in orig(web): yield chunk finally: if kwt: diff -r fb92df8b634c -r ed5448edcbfa 
hgext/largefiles/__init__.py --- a/hgext/largefiles/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -146,7 +146,7 @@ supported |= {'largefiles'} def uisetup(ui): - localrepo.localrepository.featuresetupfuncs.add(featuresetup) + localrepo.featuresetupfuncs.add(featuresetup) hg.wirepeersetupfuncs.append(proto.wirereposetup) uisetupmod.uisetup(ui) diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/lfcommands.py --- a/hgext/largefiles/lfcommands.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/lfcommands.py Wed Apr 18 15:32:08 2018 -0400 @@ -365,7 +365,7 @@ at = 0 ui.debug("sending statlfile command for %d largefiles\n" % len(files)) retval = store.exists(files) - files = filter(lambda h: not retval[h], files) + files = [h for h in files if not retval[h]] ui.debug("%d largefiles need to be uploaded\n" % len(files)) for hash in files: @@ -589,7 +589,7 @@ numcached = 0 for rev in revs: - ui.note(_('pulling largefiles for revision %s\n') % rev) + ui.note(_('pulling largefiles for revision %d\n') % rev) (cached, missing) = cachelfiles(ui, repo, rev) numcached += len(cached) ui.status(_("%d largefiles cached\n") % numcached) diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/lfutil.py --- a/hgext/largefiles/lfutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/lfutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -15,6 +15,7 @@ import stat from mercurial.i18n import _ +from mercurial.node import hex from mercurial import ( dirstate, @@ -371,7 +372,7 @@ for data in instream: hasher.update(data) outfile.write(data) - return hasher.hexdigest() + return hex(hasher.digest()) def hashfile(file): if not os.path.exists(file): @@ -404,7 +405,7 @@ h = hashlib.sha1() for chunk in util.filechunkiter(fileobj): h.update(chunk) - return h.hexdigest() + return hex(h.digest()) def httpsendfile(ui, filename): return httpconnection.httpsendfile(ui, filename, 'rb') diff -r fb92df8b634c -r ed5448edcbfa 
hgext/largefiles/overrides.py --- a/hgext/largefiles/overrides.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/overrides.py Wed Apr 18 15:32:08 2018 -0400 @@ -19,6 +19,7 @@ cmdutil, error, hg, + logcmdutil, match as matchmod, pathutil, pycompat, @@ -41,7 +42,7 @@ matcher''' m = copy.copy(match) lfile = lambda f: lfutil.standin(f) in manifest - m._files = filter(lfile, m._files) + m._files = [lf for lf in m._files if lfile(lf)] m._fileset = set(m._files) m.always = lambda: False origmatchfn = m.matchfn @@ -56,7 +57,7 @@ m = copy.copy(match) notlfile = lambda f: not (lfutil.isstandin(f) or lfutil.standin(f) in manifest or f in excluded) - m._files = filter(notlfile, m._files) + m._files = [lf for lf in m._files if notlfile(lf)] m._fileset = set(m._files) m.always = lambda: False origmatchfn = m.matchfn @@ -177,7 +178,7 @@ added = [f for f in lfnames if f not in bad] return added, bad -def removelargefiles(ui, repo, isaddremove, matcher, **opts): +def removelargefiles(ui, repo, isaddremove, matcher, dryrun, **opts): after = opts.get(r'after') m = composelargefilematcher(matcher, repo[None].manifest()) try: @@ -222,11 +223,11 @@ name = m.rel(f) ui.status(_('removing %s\n') % name) - if not opts.get(r'dry_run'): + if not dryrun: if not after: repo.wvfs.unlinkpath(f, ignoremissing=True) - if opts.get(r'dry_run'): + if dryrun: return result remove = [lfutil.standin(f) for f in remove] @@ -270,10 +271,12 @@ bad.extend(f for f in lbad) return bad -def cmdutilremove(orig, ui, repo, matcher, prefix, after, force, subrepos): +def cmdutilremove(orig, ui, repo, matcher, prefix, after, force, subrepos, + dryrun): normalmatcher = composenormalfilematcher(matcher, repo[None].manifest()) - result = orig(ui, repo, normalmatcher, prefix, after, force, subrepos) - return removelargefiles(ui, repo, False, matcher, after=after, + result = orig(ui, repo, normalmatcher, prefix, after, force, subrepos, + dryrun) + return removelargefiles(ui, repo, False, matcher, dryrun, 
after=after, force=force) or result def overridestatusfn(orig, repo, rev2, **opts): @@ -388,20 +391,20 @@ # (2) to determine what files to print out diffs for. # The magic matchandpats override should be used for case (1) but not for # case (2). - def overridemakelogfilematcher(repo, pats, opts, badfn=None): + def overridemakefilematcher(repo, pats, opts, badfn=None): wctx = repo[None] match, pats = oldmatchandpats(wctx, pats, opts, badfn=badfn) - return lambda rev: match + return lambda ctx: match oldmatchandpats = installmatchandpatsfn(overridematchandpats) - oldmakelogfilematcher = cmdutil._makenofollowlogfilematcher - setattr(cmdutil, '_makenofollowlogfilematcher', overridemakelogfilematcher) + oldmakefilematcher = logcmdutil._makenofollowfilematcher + setattr(logcmdutil, '_makenofollowfilematcher', overridemakefilematcher) try: return orig(ui, repo, *pats, **opts) finally: restorematchandpatsfn() - setattr(cmdutil, '_makenofollowlogfilematcher', oldmakelogfilematcher) + setattr(logcmdutil, '_makenofollowfilematcher', oldmakefilematcher) def overrideverify(orig, ui, repo, *pats, **opts): large = opts.pop(r'large', False) @@ -597,7 +600,7 @@ try: result = orig(ui, repo, pats, opts, rename) except error.Abort as e: - if str(e) != _('no files to copy'): + if pycompat.bytestr(e) != _('no files to copy'): raise e else: nonormalfiles = True @@ -666,7 +669,7 @@ try: origcopyfile = util.copyfile copiedfiles = [] - def overridecopyfile(src, dest): + def overridecopyfile(src, dest, *args, **kwargs): if (lfutil.shortname in src and dest.startswith(repo.wjoin(lfutil.shortname))): destlfile = dest.replace(lfutil.shortname, '') @@ -674,7 +677,7 @@ raise IOError('', _('destination largefile already exists')) copiedfiles.append((src, dest)) - origcopyfile(src, dest) + origcopyfile(src, dest, *args, **kwargs) util.copyfile = overridecopyfile result += orig(ui, repo, listpats, opts, rename) @@ -704,7 +707,7 @@ lfdirstate.add(destlfile) lfdirstate.write() except error.Abort as e: 
- if str(e) != _('no files to copy'): + if pycompat.bytestr(e) != _('no files to copy'): raise e else: nolfiles = True @@ -811,7 +814,7 @@ repo.firstpulled = revsprepull # for pulled() revset expression try: for rev in scmutil.revrange(repo, lfrevs): - ui.note(_('pulling largefiles for revision %s\n') % rev) + ui.note(_('pulling largefiles for revision %d\n') % rev) (cached, missing) = lfcommands.cachelfiles(ui, repo, rev) numcached += len(cached) finally: @@ -823,7 +826,7 @@ """Override push command and store --lfrev parameters in opargs""" lfrevs = kwargs.pop(r'lfrev', None) if lfrevs: - opargs = kwargs.setdefault('opargs', {}) + opargs = kwargs.setdefault(r'opargs', {}) opargs['lfrevs'] = scmutil.revrange(repo, lfrevs) return orig(ui, repo, *args, **kwargs) @@ -894,7 +897,7 @@ # Caching is implicitly limited to 'rev' option, since the dest repo was # truncated at that point. The user may expect a download count with # this option, so attempt whether or not this is a largefile repo. - if opts.get(r'all_largefiles'): + if opts.get('all_largefiles'): success, missing = lfcommands.downloadlfiles(ui, repo, None) if missing != 0: @@ -931,11 +934,11 @@ finally: repo.unfiltered().lfstatus = False -def hgwebarchive(orig, web, req, tmpl): +def hgwebarchive(orig, web): web.repo.lfstatus = True try: - return orig(web, req, tmpl) + return orig(web) finally: web.repo.lfstatus = False @@ -1076,9 +1079,11 @@ finally: repo.lfstatus = False -def cmdutilforget(orig, ui, repo, match, prefix, explicitonly): +def cmdutilforget(orig, ui, repo, match, prefix, explicitonly, dryrun, + interactive): normalmatcher = composenormalfilematcher(match, repo[None].manifest()) - bad, forgot = orig(ui, repo, normalmatcher, prefix, explicitonly) + bad, forgot = orig(ui, repo, normalmatcher, prefix, explicitonly, dryrun, + interactive) m = composelargefilematcher(match, repo[None].manifest()) try: @@ -1211,12 +1216,11 @@ finally: repo.lfstatus = False -def scmutiladdremove(orig, repo, matcher, 
prefix, opts=None, dry_run=None, - similarity=None): +def scmutiladdremove(orig, repo, matcher, prefix, opts=None): if opts is None: opts = {} if not lfutil.islfilesrepo(repo): - return orig(repo, matcher, prefix, opts, dry_run, similarity) + return orig(repo, matcher, prefix, opts) # Get the list of missing largefiles so we can remove them lfdirstate = lfutil.openlfdirstate(repo.ui, repo) unsure, s = lfdirstate.status(matchmod.always(repo.root, repo.getcwd()), @@ -1237,15 +1241,17 @@ matchfn = m.matchfn m.matchfn = lambda f: f in s.deleted and matchfn(f) - removelargefiles(repo.ui, repo, True, m, **opts) + removelargefiles(repo.ui, repo, True, m, opts.get('dry_run'), + **pycompat.strkwargs(opts)) # Call into the normal add code, and any files that *should* be added as # largefiles will be - added, bad = addlargefiles(repo.ui, repo, True, matcher, **opts) + added, bad = addlargefiles(repo.ui, repo, True, matcher, + **pycompat.strkwargs(opts)) # Now that we've handled largefiles, hand off to the original addremove # function to take care of the rest. Make sure it doesn't do anything with # largefiles by passing a matcher that will ignore them. matcher = composenormalfilematcher(matcher, repo[None].manifest(), added) - return orig(repo, matcher, prefix, opts, dry_run, similarity) + return orig(repo, matcher, prefix, opts) # Calling purge with --all will cause the largefiles to be deleted. # Override repo.status to prevent this from happening. 
@@ -1358,8 +1364,7 @@ m.visitdir = lfvisitdirfn for f in ctx.walk(m): - with cmdutil.makefileobj(repo, opts.get('output'), ctx.node(), - pathname=f) as fp: + with cmdutil.makefileobj(ctx, opts.get('output'), pathname=f) as fp: lf = lfutil.splitstandin(f) if lf is None or origmatchfn(f): # duplicating unreachable code from commands.cat diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/proto.py --- a/hgext/largefiles/proto.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/proto.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,7 +13,8 @@ error, httppeer, util, - wireproto, + wireprototypes, + wireprotov1peer, ) from . import ( @@ -34,27 +35,28 @@ def putlfile(repo, proto, sha): '''Server command for putting a largefile into a repository's local store and into the user cache.''' - proto.redirect() - - path = lfutil.storepath(repo, sha) - util.makedirs(os.path.dirname(path)) - tmpfp = util.atomictempfile(path, createmode=repo.store.createmode) + with proto.mayberedirectstdio() as output: + path = lfutil.storepath(repo, sha) + util.makedirs(os.path.dirname(path)) + tmpfp = util.atomictempfile(path, createmode=repo.store.createmode) - try: - proto.getfile(tmpfp) - tmpfp._fp.seek(0) - if sha != lfutil.hexsha1(tmpfp._fp): - raise IOError(0, _('largefile contents do not match hash')) - tmpfp.close() - lfutil.linktousercache(repo, sha) - except IOError as e: - repo.ui.warn(_('largefiles: failed to put %s into store: %s\n') % - (sha, e.strerror)) - return wireproto.pushres(1) - finally: - tmpfp.discard() + try: + for p in proto.getpayload(): + tmpfp.write(p) + tmpfp._fp.seek(0) + if sha != lfutil.hexsha1(tmpfp._fp): + raise IOError(0, _('largefile contents do not match hash')) + tmpfp.close() + lfutil.linktousercache(repo, sha) + except IOError as e: + repo.ui.warn(_('largefiles: failed to put %s into store: %s\n') % + (sha, e.strerror)) + return wireprototypes.pushres( + 1, output.getvalue() if output else '') + finally: + tmpfp.discard() - return wireproto.pushres(0) + 
return wireprototypes.pushres(0, output.getvalue() if output else '') def getlfile(repo, proto, sha): '''Server command for retrieving a largefile from the repository-local @@ -75,7 +77,7 @@ yield '%d\n' % length for chunk in util.filechunkiter(f): yield chunk - return wireproto.streamres_legacy(gen=generator()) + return wireprototypes.streamreslegacy(gen=generator()) def statlfile(repo, proto, sha): '''Server command for checking if a largefile is present - returns '2\n' if @@ -86,8 +88,8 @@ server side.''' filename = lfutil.findfile(repo, sha) if not filename: - return '2\n' - return '0\n' + return wireprototypes.bytesresponse('2\n') + return wireprototypes.bytesresponse('0\n') def wirereposetup(ui, repo): class lfileswirerepository(repo.__class__): @@ -97,7 +99,7 @@ # it ... if issubclass(self.__class__, httppeer.httppeer): res = self._call('putlfile', data=fd, sha=sha, - headers={'content-type':'application/mercurial-0.1'}) + headers={r'content-type': r'application/mercurial-0.1'}) try: d, output = res.split('\n', 1) for l in output.splitlines(True): @@ -143,9 +145,9 @@ self._abort(error.ResponseError(_("unexpected response:"), chunk)) - @wireproto.batchable + @wireprotov1peer.batchable def statlfile(self, sha): - f = wireproto.future() + f = wireprotov1peer.future() result = {'sha': sha} yield result, f try: @@ -166,12 +168,13 @@ caps.append('largefiles=serve') return caps -def heads(repo, proto): +def heads(orig, repo, proto): '''Wrap server command - largefile capable clients will know to call lheads instead''' if lfutil.islfilesrepo(repo): - return wireproto.ooberror(LARGEFILES_REQUIRED_MSG) - return wireproto.heads(repo, proto) + return wireprototypes.ooberror(LARGEFILES_REQUIRED_MSG) + + return orig(repo, proto) def sshrepocallstream(self, cmd, **args): if cmd == 'heads' and self.capable('largefiles'): @@ -180,7 +183,7 @@ args[r'cmds'] = args[r'cmds'].replace('heads ', 'lheads ') return ssholdcallstream(self, cmd, **args) -headsre = 
re.compile(r'(^|;)heads\b') +headsre = re.compile(br'(^|;)heads\b') def httprepocallstream(self, cmd, **args): if cmd == 'heads' and self.capable('largefiles'): diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/remotestore.py --- a/hgext/largefiles/remotestore.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/remotestore.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,6 +14,10 @@ util, ) +from mercurial.utils import ( + stringutil, +) + from . import ( basestore, lfutil, @@ -52,7 +56,7 @@ except IOError as e: raise error.Abort( _('remotestore: could not open file %s: %s') - % (filename, str(e))) + % (filename, stringutil.forcebytestr(e))) def _getfile(self, tmpfile, filename, hash): try: @@ -60,7 +64,8 @@ except urlerr.httperror as e: # 401s get converted to error.Aborts; everything else is fine being # turned into a StoreError - raise basestore.StoreError(filename, hash, self.url, str(e)) + raise basestore.StoreError(filename, hash, self.url, + stringutil.forcebytestr(e)) except urlerr.urlerror as e: # This usually indicates a connection problem, so don't # keep trying with the other files... 
they will probably @@ -68,7 +73,8 @@ raise error.Abort('%s: %s' % (util.hidepassword(self.url), e.reason)) except IOError as e: - raise basestore.StoreError(filename, hash, self.url, str(e)) + raise basestore.StoreError(filename, hash, self.url, + stringutil.forcebytestr(e)) return lfutil.copyandhash(chunks, tmpfile) diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/storefactory.py --- a/hgext/largefiles/storefactory.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/storefactory.py Wed Apr 18 15:32:08 2018 -0400 @@ -80,7 +80,7 @@ 'ssh': [wirestore.wirestore], } -_scheme_re = re.compile(r'^([a-zA-Z0-9+-.]+)://') +_scheme_re = re.compile(br'^([a-zA-Z0-9+-.]+)://') def getlfile(ui, hash): return util.chunkbuffer(openstore(ui=ui)._get(hash)) diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/uisetup.py --- a/hgext/largefiles/uisetup.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/uisetup.py Wed Apr 18 15:32:08 2018 -0400 @@ -31,7 +31,7 @@ subrepo, upgrade, url, - wireproto, + wireprotov1server, ) from . import ( @@ -164,30 +164,30 @@ overrides.openlargefile) # create the new wireproto commands ... - wireproto.commands['putlfile'] = (proto.putlfile, 'sha') - wireproto.commands['getlfile'] = (proto.getlfile, 'sha') - wireproto.commands['statlfile'] = (proto.statlfile, 'sha') + wireprotov1server.wireprotocommand('putlfile', 'sha', permission='push')( + proto.putlfile) + wireprotov1server.wireprotocommand('getlfile', 'sha', permission='pull')( + proto.getlfile) + wireprotov1server.wireprotocommand('statlfile', 'sha', permission='pull')( + proto.statlfile) + wireprotov1server.wireprotocommand('lheads', '', permission='pull')( + wireprotov1server.heads) # ... and wrap some existing ones - wireproto.commands['heads'] = (proto.heads, '') - wireproto.commands['lheads'] = (wireproto.heads, '') - - # make putlfile behave the same as push and {get,stat}lfile behave - # the same as pull w.r.t. 
permissions checks - wireproto.permissions['putlfile'] = 'push' - wireproto.permissions['getlfile'] = 'pull' - wireproto.permissions['statlfile'] = 'pull' - wireproto.permissions['lheads'] = 'pull' + extensions.wrapfunction(wireprotov1server.commands['heads'], 'func', + proto.heads) + # TODO also wrap wireproto.commandsv2 once heads is implemented there. extensions.wrapfunction(webcommands, 'decodepath', overrides.decodepath) - extensions.wrapfunction(wireproto, '_capabilities', proto._capabilities) + extensions.wrapfunction(wireprotov1server, '_capabilities', + proto._capabilities) # can't do this in reposetup because it needs to have happened before # wirerepo.__init__ is called - proto.ssholdcallstream = sshpeer.sshpeer._callstream + proto.ssholdcallstream = sshpeer.sshv1peer._callstream proto.httpoldcallstream = httppeer.httppeer._callstream - sshpeer.sshpeer._callstream = proto.sshrepocallstream + sshpeer.sshv1peer._callstream = proto.sshrepocallstream httppeer.httppeer._callstream = proto.httprepocallstream # override some extensions' stuff as well diff -r fb92df8b634c -r ed5448edcbfa hgext/largefiles/wirestore.py --- a/hgext/largefiles/wirestore.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/largefiles/wirestore.py Wed Apr 18 15:32:08 2018 -0400 @@ -32,8 +32,12 @@ '''For each hash, return 0 if it is available, other values if not. 
It is usually 2 if the largefile is missing, but might be 1 the server has a corrupted copy.''' - batch = self.remote.iterbatch() - for hash in hashes: - batch.statlfile(hash) - batch.submit() - return dict(zip(hashes, batch.results())) + + with self.remote.commandexecutor() as e: + fs = [] + for hash in hashes: + fs.append((hash, e.callcommand('statlfile', { + 'sha': hash, + }))) + + return {hash: f.result() for hash, f in fs} diff -r fb92df8b634c -r ed5448edcbfa hgext/lfs/__init__.py --- a/hgext/lfs/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/lfs/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -87,7 +87,9 @@ # git-lfs endpoint # - file:///tmp/path # local filesystem, usually for testing - # if unset, lfs will prompt setting this when it must use this value. + # if unset, lfs will assume the remote repository also handles blob storage + # for http(s) URLs. Otherwise, lfs will prompt to set this when it must + # use this value. # (default: unset) url = https://example.com/repo.git/info/lfs @@ -143,15 +145,17 @@ registrar, revlog, scmutil, - templatekw, + templateutil, upgrade, util, vfs as vfsmod, - wireproto, + wireprotoserver, + wireprotov1server, ) from . 
import ( blobstore, + wireprotolfsserver, wrapper, ) @@ -164,9 +168,15 @@ configtable = {} configitem = registrar.configitem(configtable) +configitem('experimental', 'lfs.serve', + default=True, +) configitem('experimental', 'lfs.user-agent', default=None, ) +configitem('experimental', 'lfs.disableusercache', + default=False, +) configitem('experimental', 'lfs.worker-enable', default=False, ) @@ -192,13 +202,14 @@ command = registrar.command(cmdtable) templatekeyword = registrar.templatekeyword() +filesetpredicate = registrar.filesetpredicate() def featuresetup(ui, supported): # don't die on seeing a repo with the lfs requirement supported |= {'lfs'} def uisetup(ui): - localrepo.localrepository.featuresetupfuncs.add(featuresetup) + localrepo.featuresetupfuncs.add(featuresetup) def reposetup(ui, repo): # Nothing to do with a remote repo @@ -211,7 +222,7 @@ class lfsrepo(repo.__class__): @localrepo.unfilteredmethod def commitctx(self, ctx, error=False): - repo.svfs.options['lfstrack'] = _trackedmatcher(self, ctx) + repo.svfs.options['lfstrack'] = _trackedmatcher(self) return super(lfsrepo, self).commitctx(ctx, error) repo.__class__ = lfsrepo @@ -219,15 +230,17 @@ if 'lfs' not in repo.requirements: def checkrequireslfs(ui, repo, **kwargs): if 'lfs' not in repo.requirements: - last = kwargs.get('node_last') + last = kwargs.get(r'node_last') _bin = node.bin if last: - s = repo.set('%n:%n', _bin(kwargs['node']), _bin(last)) + s = repo.set('%n:%n', _bin(kwargs[r'node']), _bin(last)) else: - s = repo.set('%n', _bin(kwargs['node'])) + s = repo.set('%n', _bin(kwargs[r'node'])) + match = repo.narrowmatch() for ctx in s: # TODO: is there a way to just walk the files in the commit? 
- if any(ctx[f].islfs() for f in ctx.files() if f in ctx): + if any(ctx[f].islfs() for f in ctx.files() + if f in ctx and match(f)): repo.requirements.add('lfs') repo._writerequirements() repo.prepushoutgoinghooks.add('lfs', wrapper.prepush) @@ -238,7 +251,7 @@ else: repo.prepushoutgoinghooks.add('lfs', wrapper.prepush) -def _trackedmatcher(repo, ctx): +def _trackedmatcher(repo): """Return a function (path, size) -> bool indicating whether or not to track a given file with lfs.""" if not repo.wvfs.exists('.hglfs'): @@ -306,14 +319,13 @@ wrapper.upgraderequirements) wrapfunction(changegroup, - 'supportedoutgoingversions', - wrapper.supportedoutgoingversions) - wrapfunction(changegroup, 'allsupportedversions', wrapper.allsupportedversions) wrapfunction(exchange, 'push', wrapper.push) - wrapfunction(wireproto, '_capabilities', wrapper._capabilities) + wrapfunction(wireprotov1server, '_capabilities', wrapper._capabilities) + wrapfunction(wireprotoserver, 'handlewsgirequest', + wireprotolfsserver.handlewsgirequest) wrapfunction(context.basefilectx, 'cmp', wrapper.filectxcmp) wrapfunction(context.basefilectx, 'isbinary', wrapper.filectxisbinary) @@ -331,12 +343,12 @@ wrapfunction(hg, 'clone', wrapper.hgclone) wrapfunction(hg, 'postshare', wrapper.hgpostshare) + scmutil.fileprefetchhooks.add('lfs', wrapper._prefetchfiles) + # Make bundle choose changegroup3 instead of changegroup2. This affects # "hg bundle" command. Note: it does not cover all bundle formats like # "packed1". Using "packed1" with lfs will likely cause trouble. - names = [k for k, v in exchange._bundlespeccgversions.items() if v == '02'] - for k in names: - exchange._bundlespeccgversions[k] = '03' + exchange._bundlespeccontentopts["v2"]["cg.version"] = "03" # bundlerepo uses "vfsmod.readonlyvfs(othervfs)", we need to make sure lfs # options and blob stores are passed from othervfs to the new readonlyvfs. 
@@ -345,12 +357,21 @@ # when writing a bundle via "hg bundle" command, upload related LFS blobs wrapfunction(bundle2, 'writenewbundle', wrapper.writenewbundle) -@templatekeyword('lfs_files') -def lfsfiles(repo, ctx, **args): - """List of strings. LFS files added or modified by the changeset.""" - args = pycompat.byteskwargs(args) +@filesetpredicate('lfs()', callstatus=True) +def lfsfileset(mctx, x): + """File that uses LFS storage.""" + # i18n: "lfs" is a keyword + fileset.getargs(x, 0, 0, _("lfs takes no arguments")) + return [f for f in mctx.subset + if wrapper.pointerfromctx(mctx.ctx, f, removed=True) is not None] - pointers = wrapper.pointersfromctx(ctx) # {path: pointer} +@templatekeyword('lfs_files', requires={'ctx'}) +def lfsfiles(context, mapping): + """List of strings. All files modified, added, or removed by this + changeset.""" + ctx = context.resource(mapping, 'ctx') + + pointers = wrapper.pointersfromctx(ctx, removed=True) # {path: pointer} files = sorted(pointers.keys()) def pointer(v): @@ -361,18 +382,18 @@ makemap = lambda v: { 'file': v, - 'lfsoid': pointers[v].oid(), - 'lfspointer': templatekw.hybriddict(pointer(v)), + 'lfsoid': pointers[v].oid() if pointers[v] else None, + 'lfspointer': templateutil.hybriddict(pointer(v)), } # TODO: make the separator ', '? 
- f = templatekw._showlist('lfs_file', files, args) - return templatekw._hybrid(f, files, makemap, pycompat.identity) + f = templateutil._showcompatlist(context, mapping, 'lfs_file', files) + return templateutil.hybrid(f, files, makemap, pycompat.identity) @command('debuglfsupload', [('r', 'rev', [], _('upload large files introduced by REV'))]) def debuglfsupload(ui, repo, **opts): """upload lfs blobs added by the working copy parent or given revisions""" - revs = opts.get('rev', []) + revs = opts.get(r'rev', []) pointers = wrapper.extractpointers(repo, scmutil.revrange(repo, revs)) wrapper.uploadblobs(repo, pointers) diff -r fb92df8b634c -r ed5448edcbfa hgext/lfs/blobstore.py --- a/hgext/lfs/blobstore.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/lfs/blobstore.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,6 +7,7 @@ from __future__ import absolute_import +import errno import hashlib import json import os @@ -18,6 +19,7 @@ from mercurial import ( error, pathutil, + pycompat, url as urlmod, util, vfs as vfsmod, @@ -27,7 +29,7 @@ from ..largefiles import lfutil # 64 bytes for SHA256 -_lfsre = re.compile(r'\A[a-f0-9]{64}\Z') +_lfsre = re.compile(br'\A[a-f0-9]{64}\Z') class lfsvfs(vfsmod.vfs): def join(self, path): @@ -58,6 +60,26 @@ yield ('', [], oids) +class nullvfs(lfsvfs): + def __init__(self): + pass + + def exists(self, oid): + return False + + def read(self, oid): + # store.read() calls into here if the blob doesn't exist in its + # self.vfs. Raise the same error as a normal vfs when asked to read a + # file that doesn't exist. The only difference is the full file path + # isn't available in the error. + raise IOError(errno.ENOENT, '%s: No such file or directory' % oid) + + def walk(self, path=None, onerror=None): + return ('', [], []) + + def write(self, oid, data): + pass + class filewithprogress(object): """a file-like object that supports __len__ and read. 
@@ -96,8 +118,12 @@ def __init__(self, repo): fullpath = repo.svfs.join('lfs/objects') self.vfs = lfsvfs(fullpath) - usercache = lfutil._usercachedir(repo.ui, 'lfs') - self.cachevfs = lfsvfs(usercache) + + if repo.ui.configbool('experimental', 'lfs.disableusercache'): + self.cachevfs = nullvfs() + else: + usercache = lfutil._usercachedir(repo.ui, 'lfs') + self.cachevfs = lfsvfs(usercache) self.ui = repo.ui def open(self, oid): @@ -126,13 +152,10 @@ realoid = sha256.hexdigest() if realoid != oid: - raise error.Abort(_('corrupt remote lfs object: %s') % oid) + raise LfsCorruptionError(_('corrupt remote lfs object: %s') + % oid) - # XXX: should we verify the content of the cache, and hardlink back to - # the local store on success, but truncate, write and link on failure? - if not self.cachevfs.exists(oid): - self.ui.note(_('lfs: adding %s to the usercache\n') % oid) - lfutil.link(self.vfs.join(oid), self.cachevfs.join(oid)) + self._linktousercache(oid) def write(self, oid, data): """Write blob to local blobstore. @@ -143,9 +166,13 @@ with self.vfs(oid, 'wb', atomictemp=True) as fp: fp.write(data) + self._linktousercache(oid) + + def _linktousercache(self, oid): # XXX: should we verify the content of the cache, and hardlink back to # the local store on success, but truncate, write and link on failure? 
- if not self.cachevfs.exists(oid): + if (not self.cachevfs.exists(oid) + and not isinstance(self.cachevfs, nullvfs)): self.ui.note(_('lfs: adding %s to the usercache\n') % oid) lfutil.link(self.vfs.join(oid), self.cachevfs.join(oid)) @@ -174,6 +201,17 @@ _verify(oid, blob) return blob + def verify(self, oid): + """Indicate whether or not the hash of the underlying file matches its + name.""" + sha256 = hashlib.sha256() + + with self.open(oid) as fp: + for chunk in util.filechunkiter(fp, size=1048576): + sha256.update(chunk) + + return oid == sha256.hexdigest() + def has(self, oid): """Returns True if the local blobstore contains the requested blob, False otherwise.""" @@ -194,11 +232,11 @@ def writebatch(self, pointers, fromstore): """Batch upload from local to remote blobstore.""" - self._batch(pointers, fromstore, 'upload') + self._batch(_deduplicate(pointers), fromstore, 'upload') def readbatch(self, pointers, tostore): """Batch download from remote to local blostore.""" - self._batch(pointers, tostore, 'download') + self._batch(_deduplicate(pointers), tostore, 'download') def _batchrequest(self, pointers, action): """Get metadata about objects pointed by pointers for given action @@ -216,7 +254,8 @@ batchreq.add_header('Accept', 'application/vnd.git-lfs+json') batchreq.add_header('Content-Type', 'application/vnd.git-lfs+json') try: - rawjson = self.urlopener.open(batchreq).read() + rsp = self.urlopener.open(batchreq) + rawjson = rsp.read() except util.urlerr.httperror as ex: raise LfsRemoteError(_('LFS HTTP error: %s (action=%s)') % (ex, action)) @@ -225,6 +264,20 @@ except ValueError: raise LfsRemoteError(_('LFS server returns invalid JSON: %s') % rawjson) + + if self.ui.debugflag: + self.ui.debug('Status: %d\n' % rsp.status) + # lfs-test-server and hg serve return headers in different order + self.ui.debug('%s\n' + % '\n'.join(sorted(str(rsp.info()).splitlines()))) + + if 'objects' in response: + response['objects'] = sorted(response['objects'], + key=lambda 
p: p['oid']) + self.ui.debug('%s\n' + % json.dumps(response, indent=2, + separators=('', ': '), sort_keys=True)) + return response def _checkforservererror(self, pointers, responses, action): @@ -236,23 +289,34 @@ # server implementation (ex. lfs-test-server) does not set "error" # but just removes "download" from "actions". Treat that case # as the same as 404 error. - notfound = (response.get('error', {}).get('code') == 404 - or (action == 'download' - and action not in response.get('actions', []))) - if notfound: - ptrmap = {p.oid(): p for p in pointers} - p = ptrmap.get(response['oid'], None) - if p: - filename = getattr(p, 'filename', 'unknown') - raise LfsRemoteError( - _(('LFS server error. Remote object ' - 'for "%s" not found: %r')) % (filename, response)) + if 'error' not in response: + if (action == 'download' + and action not in response.get('actions', [])): + code = 404 else: - raise LfsRemoteError( - _('LFS server error. Unsolicited response for oid %s') - % response['oid']) - if 'error' in response: - raise LfsRemoteError(_('LFS server error: %r') % response) + continue + else: + # An error dict without a code doesn't make much sense, so + # treat as a server error. + code = response.get('error').get('code', 500) + + ptrmap = {p.oid(): p for p in pointers} + p = ptrmap.get(response['oid'], None) + if p: + filename = getattr(p, 'filename', 'unknown') + errors = { + 404: 'The object does not exist', + 410: 'The object was removed by the owner', + 422: 'Validation error', + 500: 'Internal server error', + } + msg = errors.get(code, 'status code %d' % code) + raise LfsRemoteError(_('LFS server error for "%s": %s') + % (filename, msg)) + else: + raise LfsRemoteError( + _('LFS server error. 
Unsolicited response for oid %s') + % response['oid']) def _extractobjects(self, response, pointers, action): """extract objects from response of the batch API @@ -281,18 +345,20 @@ See https://github.com/git-lfs/git-lfs/blob/master/docs/api/\ basic-transfers.md """ - oid = str(obj['oid']) + oid = pycompat.bytestr(obj['oid']) - href = str(obj['actions'][action].get('href')) + href = pycompat.bytestr(obj['actions'][action].get('href')) headers = obj['actions'][action].get('header', {}).items() request = util.urlreq.request(href) if action == 'upload': # If uploading blobs, read data from local blobstore. - with localstore.open(oid) as fp: - _verifyfile(oid, fp) + if not localstore.verify(oid): + raise error.Abort(_('detected corrupt lfs object: %s') % oid, + hint=_('run hg verify')) request.data = filewithprogress(localstore.open(oid), None) request.get_method = lambda: 'PUT' + request.add_header('Content-Type', 'application/octet-stream') for k, v in headers: request.add_header(k, v) @@ -300,6 +366,13 @@ response = b'' try: req = self.urlopener.open(request) + + if self.ui.debugflag: + self.ui.debug('Status: %d\n' % req.status) + # lfs-test-server and hg serve return headers in different order + self.ui.debug('%s\n' + % '\n'.join(sorted(str(req.info()).splitlines()))) + if action == 'download': # If downloading blobs, store downloaded data to local blobstore localstore.download(oid, req) @@ -366,12 +439,22 @@ oids = transfer(sorted(objects, key=lambda o: o.get('oid'))) processed = 0 + blobs = 0 for _one, oid in oids: processed += sizes[oid] + blobs += 1 self.ui.progress(topic, processed, total=total) self.ui.note(_('lfs: processed: %s\n') % oid) self.ui.progress(topic, pos=None, total=total) + if blobs > 0: + if action == 'upload': + self.ui.status(_('lfs: uploaded %d files (%s)\n') + % (blobs, util.bytecount(processed))) + elif action == 'download': + self.ui.status(_('lfs: downloaded %d files (%s)\n') + % (blobs, util.bytecount(processed))) + def __del__(self): # 
copied from mercurial/httppeer.py urlopener = getattr(self, 'urlopener', None) @@ -388,13 +471,13 @@ self.vfs = lfsvfs(fullpath) def writebatch(self, pointers, fromstore): - for p in pointers: + for p in _deduplicate(pointers): content = fromstore.read(p.oid(), verify=True) with self.vfs(p.oid(), 'wb', atomictemp=True) as fp: fp.write(content) def readbatch(self, pointers, tostore): - for p in pointers: + for p in _deduplicate(pointers): with self.vfs(p.oid(), 'rb') as fp: tostore.download(p.oid(), fp) @@ -433,27 +516,56 @@ None: _promptremote, } +def _deduplicate(pointers): + """Remove any duplicate oids that exist in the list""" + reduced = util.sortdict() + for p in pointers: + reduced[p.oid()] = p + return reduced.values() + def _verify(oid, content): realoid = hashlib.sha256(content).hexdigest() if realoid != oid: - raise error.Abort(_('detected corrupt lfs object: %s') % oid, - hint=_('run hg verify')) + raise LfsCorruptionError(_('detected corrupt lfs object: %s') % oid, + hint=_('run hg verify')) + +def remote(repo, remote=None): + """remotestore factory. return a store in _storemap depending on config + + If ``lfs.url`` is specified, use that remote endpoint. Otherwise, try to + infer the endpoint, based on the remote repository using the same path + adjustments as git. As an extension, 'http' is supported as well so that + ``hg serve`` works out of the box. 
-def _verifyfile(oid, fp): - sha256 = hashlib.sha256() - while True: - data = fp.read(1024 * 1024) - if not data: - break - sha256.update(data) - realoid = sha256.hexdigest() - if realoid != oid: - raise error.Abort(_('detected corrupt lfs object: %s') % oid, - hint=_('run hg verify')) + https://github.com/git-lfs/git-lfs/blob/master/docs/api/server-discovery.md + """ + lfsurl = repo.ui.config('lfs', 'url') + url = util.url(lfsurl or '') + if lfsurl is None: + if remote: + path = remote + elif util.safehasattr(repo, '_subtoppath'): + # The pull command sets this during the optional update phase, which + # tells exactly where the pull originated, whether 'paths.default' + # or explicit. + path = repo._subtoppath + else: + # TODO: investigate 'paths.remote:lfsurl' style path customization, + # and fall back to inferring from 'paths.remote' if unspecified. + path = repo.ui.config('paths', 'default') or '' -def remote(repo): - """remotestore factory. return a store in _storemap depending on config""" - url = util.url(repo.ui.config('lfs', 'url') or '') + defaulturl = util.url(path) + + # TODO: support local paths as well. 
+ # TODO: consider the ssh -> https transformation that git applies + if defaulturl.scheme in (b'http', b'https'): + if defaulturl.path and defaulturl.path[:-1] != b'/': + defaulturl.path += b'/' + defaulturl.path = (defaulturl.path or b'') + b'.git/info/lfs' + + url = util.url(bytes(defaulturl)) + repo.ui.note(_('lfs: assuming remote store: %s\n') % url) + scheme = url.scheme if scheme not in _storemap: raise error.Abort(_('lfs: unknown url scheme: %s') % scheme) @@ -461,3 +573,8 @@ class LfsRemoteError(error.RevlogError): pass + +class LfsCorruptionError(error.Abort): + """Raised when a corrupt blob is detected, aborting an operation + + It exists to allow specialized handling on the server side.""" diff -r fb92df8b634c -r ed5448edcbfa hgext/lfs/pointer.py --- a/hgext/lfs/pointer.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/lfs/pointer.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,6 +13,7 @@ from mercurial import ( error, + pycompat, ) class InvalidPointer(error.RevlogError): @@ -23,7 +24,8 @@ def __init__(self, *args, **kwargs): self['version'] = self.VERSION - super(gitlfspointer, self).__init__(*args, **kwargs) + super(gitlfspointer, self).__init__(*args) + self.update(pycompat.byteskwargs(kwargs)) @classmethod def deserialize(cls, text): @@ -45,12 +47,12 @@ # regular expressions used by _validate # see https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md - _keyre = re.compile(r'\A[a-z0-9.-]+\Z') - _valuere = re.compile(r'\A[^\n]*\Z') + _keyre = re.compile(br'\A[a-z0-9.-]+\Z') + _valuere = re.compile(br'\A[^\n]*\Z') _requiredre = { - 'size': re.compile(r'\A[0-9]+\Z'), - 'oid': re.compile(r'\Asha256:[0-9a-f]{64}\Z'), - 'version': re.compile(r'\A%s\Z' % re.escape(VERSION)), + 'size': re.compile(br'\A[0-9]+\Z'), + 'oid': re.compile(br'\Asha256:[0-9a-f]{64}\Z'), + 'version': re.compile(br'\A%s\Z' % re.escape(VERSION)), } def validate(self): diff -r fb92df8b634c -r ed5448edcbfa hgext/lfs/wireprotolfsserver.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/hgext/lfs/wireprotolfsserver.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,336 @@ +# wireprotolfsserver.py - lfs protocol server side implementation +# +# Copyright 2018 Matt Harbison +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import datetime +import errno +import json +import traceback + +from mercurial.hgweb import ( + common as hgwebcommon, +) + +from mercurial import ( + pycompat, +) + +from . import blobstore + +HTTP_OK = hgwebcommon.HTTP_OK +HTTP_CREATED = hgwebcommon.HTTP_CREATED +HTTP_BAD_REQUEST = hgwebcommon.HTTP_BAD_REQUEST +HTTP_NOT_FOUND = hgwebcommon.HTTP_NOT_FOUND +HTTP_METHOD_NOT_ALLOWED = hgwebcommon.HTTP_METHOD_NOT_ALLOWED +HTTP_NOT_ACCEPTABLE = hgwebcommon.HTTP_NOT_ACCEPTABLE +HTTP_UNSUPPORTED_MEDIA_TYPE = hgwebcommon.HTTP_UNSUPPORTED_MEDIA_TYPE + +def handlewsgirequest(orig, rctx, req, res, checkperm): + """Wrap wireprotoserver.handlewsgirequest() to possibly process an LFS + request if it is left unprocessed by the wrapped method. + """ + if orig(rctx, req, res, checkperm): + return True + + if not rctx.repo.ui.configbool('experimental', 'lfs.serve'): + return False + + if not req.dispatchpath: + return False + + try: + if req.dispatchpath == b'.git/info/lfs/objects/batch': + checkperm(rctx, req, 'pull') + return _processbatchrequest(rctx.repo, req, res) + # TODO: reserve and use a path in the proposed http wireprotocol /api/ + # namespace? + elif req.dispatchpath.startswith(b'.hg/lfs/objects'): + return _processbasictransfer(rctx.repo, req, res, + lambda perm: + checkperm(rctx, req, perm)) + return False + except hgwebcommon.ErrorResponse as e: + # XXX: copied from the handler surrounding wireprotoserver._callhttp() + # in the wrapped function. Should this be moved back to hgweb to + # be a common handler? 
+ for k, v in e.headers: + res.headers[k] = v + res.status = hgwebcommon.statusmessage(e.code, pycompat.bytestr(e)) + res.setbodybytes(b'0\n%s\n' % pycompat.bytestr(e)) + return True + +def _sethttperror(res, code, message=None): + res.status = hgwebcommon.statusmessage(code, message=message) + res.headers[b'Content-Type'] = b'text/plain; charset=utf-8' + res.setbodybytes(b'') + +def _logexception(req): + """Write information about the current exception to wsgi.errors.""" + tb = pycompat.sysbytes(traceback.format_exc()) + errorlog = req.rawenv[r'wsgi.errors'] + + uri = b'' + if req.apppath: + uri += req.apppath + uri += b'/' + req.dispatchpath + + errorlog.write(b"Exception happened while processing request '%s':\n%s" % + (uri, tb)) + +def _processbatchrequest(repo, req, res): + """Handle a request for the Batch API, which is the gateway to granting file + access. + + https://github.com/git-lfs/git-lfs/blob/master/docs/api/batch.md + """ + + # Mercurial client request: + # + # HOST: localhost:$HGPORT + # ACCEPT: application/vnd.git-lfs+json + # ACCEPT-ENCODING: identity + # USER-AGENT: git-lfs/2.3.4 (Mercurial 4.5.2+1114-f48b9754f04c+20180316) + # Content-Length: 125 + # Content-Type: application/vnd.git-lfs+json + # + # { + # "objects": [ + # { + # "oid": "31cf...8e5b" + # "size": 12 + # } + # ] + # "operation": "upload" + # } + + if req.method != b'POST': + _sethttperror(res, HTTP_METHOD_NOT_ALLOWED) + return True + + if req.headers[b'Content-Type'] != b'application/vnd.git-lfs+json': + _sethttperror(res, HTTP_UNSUPPORTED_MEDIA_TYPE) + return True + + if req.headers[b'Accept'] != b'application/vnd.git-lfs+json': + _sethttperror(res, HTTP_NOT_ACCEPTABLE) + return True + + # XXX: specify an encoding? + lfsreq = json.loads(req.bodyfh.read()) + + # If no transfer handlers are explicitly requested, 'basic' is assumed. 
+ if 'basic' not in lfsreq.get('transfers', ['basic']): + _sethttperror(res, HTTP_BAD_REQUEST, + b'Only the basic LFS transfer handler is supported') + return True + + operation = lfsreq.get('operation') + if operation not in ('upload', 'download'): + _sethttperror(res, HTTP_BAD_REQUEST, + b'Unsupported LFS transfer operation: %s' % operation) + return True + + localstore = repo.svfs.lfslocalblobstore + + objects = [p for p in _batchresponseobjects(req, lfsreq.get('objects', []), + operation, localstore)] + + rsp = { + 'transfer': 'basic', + 'objects': objects, + } + + res.status = hgwebcommon.statusmessage(HTTP_OK) + res.headers[b'Content-Type'] = b'application/vnd.git-lfs+json' + res.setbodybytes(pycompat.bytestr(json.dumps(rsp))) + + return True + +def _batchresponseobjects(req, objects, action, store): + """Yield one dictionary of attributes for the Batch API response for each + object in the list. + + req: The parsedrequest for the Batch API request + objects: The list of objects in the Batch API object request list + action: 'upload' or 'download' + store: The local blob store for servicing requests""" + + # Successful lfs-test-server response to solict an upload: + # { + # u'objects': [{ + # u'size': 12, + # u'oid': u'31cf...8e5b', + # u'actions': { + # u'upload': { + # u'href': u'http://localhost:$HGPORT/objects/31cf...8e5b', + # u'expires_at': u'0001-01-01T00:00:00Z', + # u'header': { + # u'Accept': u'application/vnd.git-lfs' + # } + # } + # } + # }] + # } + + # TODO: Sort out the expires_at/expires_in/authenticated keys. + + for obj in objects: + # Convert unicode to ASCII to create a filesystem path + oid = obj.get('oid').encode('ascii') + rsp = { + 'oid': oid, + 'size': obj.get('size'), # XXX: should this check the local size? + #'authenticated': True, + } + + exists = True + verifies = False + + # Verify an existing file on the upload request, so that the client is + # solicited to re-upload if it corrupt locally. 
Download requests are + # also verified, so the error can be flagged in the Batch API response. + # (Maybe we can use this to short circuit the download for `hg verify`, + # IFF the client can assert that the remote end is an hg server.) + # Otherwise, it's potentially overkill on download, since it is also + # verified as the file is streamed to the caller. + try: + verifies = store.verify(oid) + except IOError as inst: + if inst.errno != errno.ENOENT: + _logexception(req) + + rsp['error'] = { + 'code': 500, + 'message': inst.strerror or 'Internal Server Server' + } + yield rsp + continue + + exists = False + + # Items are always listed for downloads. They are dropped for uploads + # IFF they already exist locally. + if action == 'download': + if not exists: + rsp['error'] = { + 'code': 404, + 'message': "The object does not exist" + } + yield rsp + continue + + elif not verifies: + rsp['error'] = { + 'code': 422, # XXX: is this the right code? + 'message': "The object is corrupt" + } + yield rsp + continue + + elif verifies: + yield rsp # Skip 'actions': already uploaded + continue + + expiresat = datetime.datetime.now() + datetime.timedelta(minutes=10) + + def _buildheader(): + # The spec doesn't mention the Accept header here, but avoid + # a gratuitous deviation from lfs-test-server in the test + # output. + hdr = { + 'Accept': 'application/vnd.git-lfs' + } + + auth = req.headers.get('Authorization', '') + if auth.startswith('Basic '): + hdr['Authorization'] = auth + + return hdr + + rsp['actions'] = { + '%s' % action: { + 'href': '%s%s/.hg/lfs/objects/%s' + % (req.baseurl, req.apppath, oid), + # datetime.isoformat() doesn't include the 'Z' suffix + "expires_at": expiresat.strftime('%Y-%m-%dT%H:%M:%SZ'), + 'header': _buildheader(), + } + } + + yield rsp + +def _processbasictransfer(repo, req, res, checkperm): + """Handle a single file upload (PUT) or download (GET) action for the Basic + Transfer Adapter. 
+ + After determining if the request is for an upload or download, the access + must be checked by calling ``checkperm()`` with either 'pull' or 'upload' + before accessing the files. + + https://github.com/git-lfs/git-lfs/blob/master/docs/api/basic-transfers.md + """ + + method = req.method + oid = req.dispatchparts[-1] + localstore = repo.svfs.lfslocalblobstore + + if len(req.dispatchparts) != 4: + _sethttperror(res, HTTP_NOT_FOUND) + return True + + if method == b'PUT': + checkperm('upload') + + # TODO: verify Content-Type? + + existed = localstore.has(oid) + + # TODO: how to handle timeouts? The body proxy handles limiting to + # Content-Length, but what happens if a client sends less than it + # says it will? + + statusmessage = hgwebcommon.statusmessage + try: + localstore.download(oid, req.bodyfh) + res.status = statusmessage(HTTP_OK if existed else HTTP_CREATED) + except blobstore.LfsCorruptionError: + _logexception(req) + + # XXX: Is this the right code? + res.status = statusmessage(422, b'corrupt blob') + + # There's no payload here, but this is the header that lfs-test-server + # sends back. This eliminates some gratuitous test output conditionals. + res.headers[b'Content-Type'] = b'text/plain; charset=utf-8' + res.setbodybytes(b'') + + return True + elif method == b'GET': + checkperm('pull') + + res.status = hgwebcommon.statusmessage(HTTP_OK) + res.headers[b'Content-Type'] = b'application/octet-stream' + + try: + # TODO: figure out how to send back the file in chunks, instead of + # reading the whole thing. (Also figure out how to send back + # an error status if an IOError occurs after a partial write + # in that case. Here, everything is read before starting.) + res.setbodybytes(localstore.read(oid)) + except blobstore.LfsCorruptionError: + _logexception(req) + + # XXX: Is this the right code? 
+ res.status = hgwebcommon.statusmessage(422, b'corrupt blob') + res.setbodybytes(b'') + + return True + else: + _sethttperror(res, HTTP_METHOD_NOT_ALLOWED, + message=b'Unsupported LFS transfer method: %s' % method) + return True diff -r fb92df8b634c -r ed5448edcbfa hgext/lfs/wrapper.py --- a/hgext/lfs/wrapper.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/lfs/wrapper.py Wed Apr 18 15:32:08 2018 -0400 @@ -10,15 +10,18 @@ import hashlib from mercurial.i18n import _ -from mercurial.node import bin, nullid, short +from mercurial.node import bin, hex, nullid, short from mercurial import ( error, - filelog, revlog, util, ) +from mercurial.utils import ( + stringutil, +) + from ..largefiles import lfutil from . import ( @@ -26,14 +29,6 @@ pointer, ) -def supportedoutgoingversions(orig, repo): - versions = orig(repo) - if 'lfs' in repo.requirements: - versions.discard('01') - versions.discard('02') - versions.add('03') - return versions - def allsupportedversions(orig, ui): versions = orig(ui) versions.add('03') @@ -73,29 +68,29 @@ name = k[len('x-hg-'):] hgmeta[name] = p[k] if hgmeta or text.startswith('\1\n'): - text = filelog.packmeta(hgmeta, text) + text = revlog.packmeta(hgmeta, text) return (text, True) def writetostore(self, text): # hg filelog metadata (includes rename, etc) - hgmeta, offset = filelog.parsemeta(text) + hgmeta, offset = revlog.parsemeta(text) if offset and offset > 0: # lfs blob does not contain hg filelog metadata text = text[offset:] # git-lfs only supports sha256 - oid = hashlib.sha256(text).hexdigest() + oid = hex(hashlib.sha256(text).digest()) self.opener.lfslocalblobstore.write(oid, text) # replace contents with metadata longoid = 'sha256:%s' % oid - metadata = pointer.gitlfspointer(oid=longoid, size=str(len(text))) + metadata = pointer.gitlfspointer(oid=longoid, size='%d' % len(text)) # by default, we expect the content to be binary. however, LFS could also # be used for non-binary content. add a special entry for non-binary data. 
# this will be used by filectx.isbinary(). - if not util.binary(text): + if not stringutil.binary(text): # not hg filelog metadata (affecting commit hash), no "x-hg-" prefix metadata['x-is-binary'] = '0' @@ -125,7 +120,7 @@ flags=revlog.REVIDX_DEFAULT_FLAGS, **kwds): textlen = len(text) # exclude hg rename meta from file size - meta, offset = filelog.parsemeta(text) + meta, offset = revlog.parsemeta(text) if offset: textlen -= offset @@ -249,6 +244,27 @@ if 'lfs' in destrepo.requirements: destrepo.vfs.append('hgrc', util.tonativeeol('\n[extensions]\nlfs=\n')) +def _prefetchfiles(repo, revs, match): + """Ensure that required LFS blobs are present, fetching them as a group if + needed.""" + pointers = [] + oids = set() + localstore = repo.svfs.lfslocalblobstore + + for rev in revs: + ctx = repo[rev] + for f in ctx.walk(match): + p = pointerfromctx(ctx, f) + if p and p.oid() not in oids and not localstore.has(p.oid()): + p.filename = f + pointers.append(p) + oids.add(p.oid()) + + if pointers: + # Recalculating the repo store here allows 'paths.default' that is set + # on the repo by a clone command to be used for the update. + blobstore.remote(repo).readbatch(pointers, localstore) + def _canskipupload(repo): # if remotestore is a null store, upload is a no-op and can be skipped return isinstance(repo.svfs.lfsremoteblobstore, blobstore._nullremote) @@ -277,7 +293,8 @@ return uploadblobsfromrevs(pushop.repo, pushop.outgoing.missing) def push(orig, repo, remote, *args, **kwargs): - """bail on push if the extension isn't enabled on remote when needed""" + """bail on push if the extension isn't enabled on remote when needed, and + update the remote store based on the destination path.""" if 'lfs' in repo.requirements: # If the remote peer is for a local repo, the requirement tests in the # base class method enforce lfs support. 
Otherwise, some revisions in @@ -288,7 +305,18 @@ m = _("required features are not supported in the destination: %s") raise error.Abort(m % 'lfs', hint=_('enable the lfs extension on the server')) - return orig(repo, remote, *args, **kwargs) + + # Repositories where this extension is disabled won't have the field. + # But if there's a requirement, then the extension must be loaded AND + # there may be blobs to push. + remotestore = repo.svfs.lfsremoteblobstore + try: + repo.svfs.lfsremoteblobstore = blobstore.remote(repo, remote.url()) + return orig(repo, remote, *args, **kwargs) + finally: + repo.svfs.lfsremoteblobstore = remotestore + else: + return orig(repo, remote, *args, **kwargs) def writenewbundle(orig, ui, repo, source, filename, bundletype, outgoing, *args, **kwargs): @@ -307,20 +335,47 @@ pointers[p.oid()] = p return sorted(pointers.values()) -def pointersfromctx(ctx): - """return a dict {path: pointer} for given single changectx""" +def pointerfromctx(ctx, f, removed=False): + """return a pointer for the named file from the given changectx, or None if + the file isn't LFS. + + Optionally, the pointer for a file deleted from the context can be returned. + Since no such pointer is actually stored, and to distinguish from a non LFS + file, this pointer is represented by an empty dict. + """ + _ctx = ctx + if f not in ctx: + if not removed: + return None + if f in ctx.p1(): + _ctx = ctx.p1() + elif f in ctx.p2(): + _ctx = ctx.p2() + else: + return None + fctx = _ctx[f] + if not _islfs(fctx.filelog(), fctx.filenode()): + return None + try: + p = pointer.deserialize(fctx.rawdata()) + if ctx == _ctx: + return p + return {} + except pointer.InvalidPointer as ex: + raise error.Abort(_('lfs: corrupted pointer (%s@%s): %s\n') + % (f, short(_ctx.node()), ex)) + +def pointersfromctx(ctx, removed=False): + """return a dict {path: pointer} for given single changectx. 
+ + If ``removed`` == True and the LFS file was removed from ``ctx``, the value + stored for the path is an empty dict. + """ result = {} for f in ctx.files(): - if f not in ctx: - continue - fctx = ctx[f] - if not _islfs(fctx.filelog(), fctx.filenode()): - continue - try: - result[f] = pointer.deserialize(fctx.rawdata()) - except pointer.InvalidPointer as ex: - raise error.Abort(_('lfs: corrupted pointer (%s@%s): %s\n') - % (f, short(ctx.node()), ex)) + p = pointerfromctx(ctx, f, removed=removed) + if p is not None: + result[f] = p return result def uploadblobs(repo, pointers): diff -r fb92df8b634c -r ed5448edcbfa hgext/mq.py --- a/hgext/mq.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/mq.py Wed Apr 18 15:32:08 2018 -0400 @@ -86,6 +86,7 @@ hg, localrepo, lock as lockmod, + logcmdutil, patch as patchmod, phases, pycompat, @@ -93,10 +94,14 @@ revsetlang, scmutil, smartset, - subrepo, + subrepoutil, util, vfs as vfsmod, ) +from mercurial.utils import ( + dateutil, + stringutil, +) release = lockmod.release seriesopts = [('s', 'summary', None, _('print first line of patch header'))] @@ -148,9 +153,13 @@ class statusentry(object): def __init__(self, node, name): self.node, self.name = node, name - def __repr__(self): + + def __bytes__(self): return hex(self.node) + ':' + self.name + __str__ = encoding.strmethod(__bytes__) + __repr__ = encoding.strmethod(__bytes__) + # The order of the headers in 'hg export' HG patches: HGHEADERS = [ # '# HG changeset patch', @@ -276,7 +285,7 @@ nodeid = None diffstart = 0 - for line in file(pf): + for line in open(pf, 'rb'): line = line.rstrip() if (line.startswith('diff --git') or (diffstart and line.startswith('+++ '))): @@ -391,12 +400,14 @@ self.comments.append('') self.comments.append(message) - def __str__(self): + def __bytes__(self): s = '\n'.join(self.comments).rstrip() if not s: return '' return s + '\n\n' + __str__ = encoding.strmethod(__bytes__) + def _delmsg(self): '''Remove existing message, keeping the rest of the 
comments fields. If comments contains 'subject: ', message will prepend @@ -438,9 +449,9 @@ def __init__(self, ui, baseui, path, patchdir=None): self.basepath = path try: - fh = open(os.path.join(path, 'patches.queue')) - cur = fh.read().rstrip() - fh.close() + with open(os.path.join(path, 'patches.queue'), r'rb') as fh: + cur = fh.read().rstrip() + if not cur: curpath = os.path.join(path, 'patches') else: @@ -461,7 +472,7 @@ self.guardsdirty = False # Handle mq.git as a bool with extended values gitmode = ui.config('mq', 'git').lower() - boolmode = util.parsebool(gitmode) + boolmode = stringutil.parsebool(gitmode) if boolmode is not None: if boolmode: gitmode = 'yes' @@ -546,10 +557,8 @@ for patchfn in patches: patchf = self.opener(patchfn, 'r') # if the patch was a git patch, refresh it as a git patch - for line in patchf: - if line.startswith('diff --git'): - diffopts.git = True - break + diffopts.git = any(line.startswith('diff --git') + for line in patchf) patchf.close() return diffopts @@ -643,21 +652,22 @@ self.seriesdirty = True def pushable(self, idx): - if isinstance(idx, str): + if isinstance(idx, bytes): idx = self.series.index(idx) patchguards = self.seriesguards[idx] if not patchguards: return True, None guards = self.active() - exactneg = [g for g in patchguards if g[0] == '-' and g[1:] in guards] + exactneg = [g for g in patchguards + if g.startswith('-') and g[1:] in guards] if exactneg: - return False, repr(exactneg[0]) - pos = [g for g in patchguards if g[0] == '+'] + return False, pycompat.byterepr(exactneg[0]) + pos = [g for g in patchguards if g.startswith('+')] exactpos = [g for g in pos if g[1:] in guards] if pos: if exactpos: - return True, repr(exactpos[0]) - return False, ' '.join(map(repr, pos)) + return True, pycompat.byterepr(exactpos[0]) + return False, ' '.join([pycompat.byterepr(p) for p in pos]) return True, '' def explainpushable(self, idx, all_patches=False): @@ -667,7 +677,7 @@ write = self.ui.warn if all_patches or 
self.ui.verbose: - if isinstance(idx, str): + if isinstance(idx, bytes): idx = self.series.index(idx) pushable, why = self.pushable(idx) if all_patches and pushable: @@ -691,12 +701,12 @@ def savedirty(self): def writelist(items, path): - fp = self.opener(path, 'w') + fp = self.opener(path, 'wb') for i in items: fp.write("%s\n" % i) fp.close() if self.applieddirty: - writelist(map(str, self.applied), self.statuspath) + writelist(map(bytes, self.applied), self.statuspath) self.applieddirty = False if self.seriesdirty: writelist(self.fullseries, self.seriespath) @@ -717,7 +727,8 @@ try: os.unlink(undo) except OSError as inst: - self.ui.warn(_('error removing undo: %s\n') % str(inst)) + self.ui.warn(_('error removing undo: %s\n') % + stringutil.forcebytestr(inst)) def backup(self, repo, files, copy=False): # backup local changes in --force case @@ -739,8 +750,8 @@ opts = {} stat = opts.get('stat') m = scmutil.match(repo[node1], files, opts) - cmdutil.diffordiffstat(self.ui, repo, diffopts, node1, node2, m, - changes, stat, fp) + logcmdutil.diffordiffstat(self.ui, repo, diffopts, node1, node2, m, + changes, stat, fp) def mergeone(self, repo, mergeq, head, patch, rev, diffopts): # first try just applying the patch @@ -773,7 +784,7 @@ diffopts = self.patchopts(diffopts, patch) patchf = self.opener(patch, "w") - comments = str(ph) + comments = bytes(ph) if comments: patchf.write(comments) self.printdiff(repo, diffopts, head, n, fp=patchf) @@ -850,7 +861,7 @@ files=files, eolmode=None) return (True, list(files), fuzz) except Exception as inst: - self.ui.note(str(inst) + '\n') + self.ui.note(stringutil.forcebytestr(inst) + '\n') if not self.ui.verbose: self.ui.warn(_("patch failed, unable to continue (try -v)\n")) self.ui.traceback() @@ -963,8 +974,8 @@ wctx = repo[None] pctx = repo['.'] overwrite = False - mergedsubstate = subrepo.submerge(repo, pctx, wctx, wctx, - overwrite) + mergedsubstate = subrepoutil.submerge(repo, pctx, wctx, wctx, + overwrite) files += 
mergedsubstate.keys() match = scmutil.matchfiles(repo, files or []) @@ -1011,9 +1022,17 @@ unknown = [] - for (i, p) in sorted([(self.findseries(p), p) for p in patches], - reverse=True): - if i is not None: + sortedseries = [] + for p in patches: + idx = self.findseries(p) + if idx is None: + sortedseries.append((-1, p)) + else: + sortedseries.append((idx, p)) + + sortedseries.sort(reverse=True) + for (i, p) in sortedseries: + if i != -1: del self.fullseries[i] else: unknown.append(p) @@ -1145,7 +1164,7 @@ for c in ('#', ':', '\r', '\n'): if c in name: raise error.Abort(_('%r cannot be used in the name of a patch') - % c) + % pycompat.bytestr(c)) def checkpatchname(self, name, force=False): self.checkreservedname(name) @@ -1178,7 +1197,7 @@ except error.Abort: pass i += 1 - name = '%s__%s' % (namebase, i) + name = '%s__%d' % (namebase, i) return name def checkkeepchanges(self, keepchanges, force): @@ -1189,13 +1208,14 @@ """options: msg: a string or a no-argument function returning a string """ + opts = pycompat.byteskwargs(opts) msg = opts.get('msg') edit = opts.get('edit') editform = opts.get('editform', 'mq.qnew') user = opts.get('user') date = opts.get('date') if date: - date = util.parsedate(date) + date = dateutil.parsedate(date) diffopts = self.diffopts({'git': opts.get('git')}, plain=True) if opts.get('checkname', True): self.checkpatchname(patchfn) @@ -1259,13 +1279,13 @@ if user: ph.setuser(user) if date: - ph.setdate('%s %s' % date) + ph.setdate('%d %d' % date) ph.setparent(hex(nctx.p1().node())) msg = nctx.description().strip() if msg == defaultmsg.strip(): msg = '' ph.setmessage(msg) - p.write(str(ph)) + p.write(bytes(ph)) if commitfiles: parent = self.qparents(repo, n) if inclsubs: @@ -1550,12 +1570,8 @@ update = True else: parents = [p.node() for p in repo[None].parents()] - needupdate = False - for entry in self.applied[start:]: - if entry.node in parents: - needupdate = True - break - update = needupdate + update = any(entry.node in parents + for 
entry in self.applied[start:]) tobackup = set() if update: @@ -1632,6 +1648,7 @@ self.printdiff(repo, diffopts, node1, node2, files=pats, opts=opts) def refresh(self, repo, pats=None, **opts): + opts = pycompat.byteskwargs(opts) if not self.applied: self.ui.write(_("no patches applied\n")) return 1 @@ -1641,7 +1658,7 @@ newuser = opts.get('user') newdate = opts.get('date') if newdate: - newdate = '%d %d' % util.parsedate(newdate) + newdate = '%d %d' % dateutil.parsedate(newdate) wlock = repo.wlock() try: @@ -1656,7 +1673,7 @@ cparents = repo.changelog.parents(top) patchparent = self.qparents(repo, top) - inclsubs = checksubstate(repo, hex(patchparent)) + inclsubs = checksubstate(repo, patchparent) if inclsubs: substatestate = repo.dirstate['.hgsubstate'] @@ -1846,7 +1863,7 @@ self.putsubstate2changes(substatestate, c) chunks = patchmod.diff(repo, patchparent, changes=c, opts=diffopts) - comments = str(ph) + comments = bytes(ph) if comments: patchf.write(comments) for chunk in chunks: @@ -1912,7 +1929,7 @@ if self.ui.formatted(): width = self.ui.termwidth() - len(pfx) - len(patchname) - 2 if width > 0: - msg = util.ellipsis(msg, width) + msg = stringutil.ellipsis(msg, width) else: msg = '' self.ui.write(patchname, label='qseries.' 
+ state) @@ -1927,7 +1944,7 @@ length = len(self.series) - start if not missing: if self.ui.verbose: - idxwidth = len(str(start + length - 1)) + idxwidth = len("%d" % (start + length - 1)) for i in xrange(start, start + length): patch = self.series[i] if patch in applied: @@ -2093,7 +2110,7 @@ if not self.ui.verbose: p = pname else: - p = str(self.series.index(pname)) + " " + pname + p = ("%d" % self.series.index(pname)) + " " + pname return p def qimport(self, repo, files, patchname=None, rev=None, existing=None, @@ -2165,9 +2182,8 @@ self.checkpatchname(patchname, force) self.fullseries.insert(0, patchname) - patchf = self.opener(patchname, "w") - cmdutil.export(repo, [n], fp=patchf, opts=diffopts) - patchf.close() + with self.opener(patchname, "w") as fp: + cmdutil.exportfile(repo, [n], fp, opts=diffopts) se = statusentry(n, patchname) self.applied.insert(0, se) @@ -2260,7 +2276,7 @@ To stop managing a patch and move it into permanent history, use the :hg:`qfinish` command.""" q = repo.mq - q.delete(repo, patches, opts) + q.delete(repo, patches, pycompat.byteskwargs(opts)) q.savedirty() return 0 @@ -2497,7 +2513,7 @@ ui.note(_('cloning main repository\n')) sr, dr = hg.clone(ui, opts, sr.url(), dest, pull=opts.get('pull'), - rev=destrev, + revs=destrev, update=False, stream=opts.get('uncompressed')) @@ -2593,7 +2609,7 @@ if not opts.get('user') and opts.get('currentuser'): opts['user'] = ui.username() if not opts.get('date') and opts.get('currentdate'): - opts['date'] = "%d %d" % util.makedate() + opts['date'] = "%d %d" % dateutil.makedate() @command("^qnew", [('e', 'edit', None, _('invoke editor on commit messages')), @@ -3189,7 +3205,7 @@ guards[g] += 1 if ui.verbose: guards['NONE'] = noguards - guards = guards.items() + guards = list(guards.items()) guards.sort(key=lambda x: x[0][1:]) if guards: ui.note(_('guards in series file:\n')) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/TODO.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/hgext/narrow/TODO.rst Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,37 @@ +Integration with the share extension needs improvement. Right now +we've seen some odd bugs, and the way we modify the contents of the +.hg/shared file is unfortunate. See wrappostshare() and unsharenarrowspec(). + +Resolve commentary on narrowrepo.wraprepo.narrowrepository.status +about the filtering of status being done at an awkward layer. This +came up during the import to hgext, but nobody's got concrete improvement +ideas as of then. + +Fold most (or preferably all) of narrowrevlog.py into core. + +Address commentary in narrowrevlog.excludedmanifestrevlog.add - +specifically we should improve the collaboration with core so that +add() never gets called on an excluded directory and we can improve +the stand-in to raise a ProgrammingError. + +Figure out how to correctly produce narrowmanifestrevlog and +narrowfilelog instances instead of monkeypatching regular revlogs at +runtime to our subclass. Even better, merge the narrowing logic +directly into core. + +Reason more completely about rename-filtering logic in +narrowfilelog. There could be some surprises lurking there. + +Formally document the narrowspec format. Unify with sparse, if at all +possible. For bonus points, unify with the server-specified narrowspec +format. + +narrowrepo.setnarrowpats() or narrowspec.save() need to make sure +they're holding the wlock. + +Implement a simple version of the expandnarrow wireproto command for +core. Having configurable shorthands for narrowspecs has been useful +at Google (and sparse has a similar feature from Facebook), so it +probably makes sense to implement the feature in core. (Google's +handler is entirely custom to Google, with a custom format related to +bazel's build language, so it's not in the narrowhg distribution.) 
diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,97 @@ +# __init__.py - narrowhg extension +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. +'''create clones which fetch history data for subset of files (EXPERIMENTAL)''' + +from __future__ import absolute_import + +# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for +# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should +# be specifying the version(s) of Mercurial they are tested with, or +# leave the attribute unspecified. +testedwith = 'ships-with-hg-core' + +from mercurial import ( + changegroup, + extensions, + hg, + localrepo, + registrar, + verify as verifymod, +) + +from . import ( + narrowbundle2, + narrowchangegroup, + narrowcommands, + narrowcopies, + narrowdirstate, + narrowmerge, + narrowpatch, + narrowrepo, + narrowrevlog, + narrowtemplates, + narrowwirepeer, +) + +configtable = {} +configitem = registrar.configitem(configtable) +# Narrowhg *has* support for serving ellipsis nodes (which are used at +# least by Google's internal server), but that support is pretty +# fragile and has a lot of problems on real-world repositories that +# have complex graph topologies. This could probably be corrected, but +# absent someone needing the full support for ellipsis nodes in +# repositories with merges, it's unlikely this work will get done. As +# of this writing in late 2017, all repositories large enough for +# ellipsis nodes to be a hard requirement also enforce strictly linear +# history for other scaling reasons. +configitem('experimental', 'narrowservebrokenellipses', + default=False, + alias=[('narrow', 'serveellipses')], +) + +# Export the commands table for Mercurial to see. 
+cmdtable = narrowcommands.table + +def featuresetup(ui, features): + features.add(changegroup.NARROW_REQUIREMENT) + +def uisetup(ui): + """Wraps user-facing mercurial commands with narrow-aware versions.""" + localrepo.featuresetupfuncs.add(featuresetup) + narrowrevlog.setup() + narrowbundle2.setup() + narrowmerge.setup() + narrowcommands.setup() + narrowchangegroup.setup() + narrowwirepeer.uisetup() + +def reposetup(ui, repo): + """Wraps local repositories with narrow repo support.""" + if not repo.local(): + return + + narrowrepo.wraprepo(repo) + if changegroup.NARROW_REQUIREMENT in repo.requirements: + narrowcopies.setup(repo) + narrowdirstate.setup(repo) + narrowpatch.setup(repo) + narrowwirepeer.reposetup(repo) + +def _verifierinit(orig, self, repo, matcher=None): + # The verifier's matcher argument was designed for narrowhg, so it should + # be None from core. If another extension passes a matcher (unlikely), + # we'll have to fail until matchers can be composed more easily. + assert matcher is None + orig(self, repo, repo.narrowmatch()) + +def extsetup(ui): + extensions.wrapfunction(verifymod.verifier, '__init__', _verifierinit) + extensions.wrapfunction(hg, 'postshare', narrowrepo.wrappostshare) + extensions.wrapfunction(hg, 'copystore', narrowrepo.unsharenarrowspec) + +templatekeyword = narrowtemplates.templatekeyword +revsetpredicate = narrowtemplates.revsetpredicate diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowbundle2.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowbundle2.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,507 @@ +# narrowbundle2.py - bundle2 extensions for narrow repository support +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. 
+ +from __future__ import absolute_import + +import collections +import errno +import struct + +from mercurial.i18n import _ +from mercurial.node import ( + bin, + nullid, + nullrev, +) +from mercurial import ( + bundle2, + changegroup, + dagutil, + error, + exchange, + extensions, + narrowspec, + repair, + util, + wireprototypes, +) +from mercurial.utils import ( + stringutil, +) + +NARROWCAP = 'narrow' +_NARROWACL_SECTION = 'narrowhgacl' +_CHANGESPECPART = NARROWCAP + ':changespec' +_SPECPART = NARROWCAP + ':spec' +_SPECPART_INCLUDE = 'include' +_SPECPART_EXCLUDE = 'exclude' +_KILLNODESIGNAL = 'KILL' +_DONESIGNAL = 'DONE' +_ELIDEDCSHEADER = '>20s20s20sl' # cset id, p1, p2, len(text) +_ELIDEDMFHEADER = '>20s20s20s20sl' # manifest id, p1, p2, link id, len(text) +_CSHEADERSIZE = struct.calcsize(_ELIDEDCSHEADER) +_MFHEADERSIZE = struct.calcsize(_ELIDEDMFHEADER) + +# When advertising capabilities, always include narrow clone support. +def getrepocaps_narrow(orig, repo, **kwargs): + caps = orig(repo, **kwargs) + caps[NARROWCAP] = ['v0'] + return caps + +def _computeellipsis(repo, common, heads, known, match, depth=None): + """Compute the shape of a narrowed DAG. + + Args: + repo: The repository we're transferring. + common: The roots of the DAG range we're transferring. + May be just [nullid], which means all ancestors of heads. + heads: The heads of the DAG range we're transferring. + match: The narrowmatcher that allows us to identify relevant changes. + depth: If not None, only consider nodes to be full nodes if they are at + most depth changesets away from one of heads. + + Returns: + A tuple of (visitnodes, relevant_nodes, ellipsisroots) where: + + visitnodes: The list of nodes (either full or ellipsis) which + need to be sent to the client. + relevant_nodes: The set of changelog nodes which change a file inside + the narrowspec. The client needs these as non-ellipsis nodes. 
+ ellipsisroots: A dict of {rev: parents} that is used in + narrowchangegroup to produce ellipsis nodes with the + correct parents. + """ + cl = repo.changelog + mfl = repo.manifestlog + + cldag = dagutil.revlogdag(cl) + # dagutil does not like nullid/nullrev + commonrevs = cldag.internalizeall(common - set([nullid])) | set([nullrev]) + headsrevs = cldag.internalizeall(heads) + if depth: + revdepth = {h: 0 for h in headsrevs} + + ellipsisheads = collections.defaultdict(set) + ellipsisroots = collections.defaultdict(set) + + def addroot(head, curchange): + """Add a root to an ellipsis head, splitting heads with 3 roots.""" + ellipsisroots[head].add(curchange) + # Recursively split ellipsis heads with 3 roots by finding the + # roots' youngest common descendant which is an elided merge commit. + # That descendant takes 2 of the 3 roots as its own, and becomes a + # root of the head. + while len(ellipsisroots[head]) > 2: + child, roots = splithead(head) + splitroots(head, child, roots) + head = child # Recurse in case we just added a 3rd root + + def splitroots(head, child, roots): + ellipsisroots[head].difference_update(roots) + ellipsisroots[head].add(child) + ellipsisroots[child].update(roots) + ellipsisroots[child].discard(child) + + def splithead(head): + r1, r2, r3 = sorted(ellipsisroots[head]) + for nr1, nr2 in ((r2, r3), (r1, r3), (r1, r2)): + mid = repo.revs('sort(merge() & %d::%d & %d::%d, -rev)', + nr1, head, nr2, head) + for j in mid: + if j == nr2: + return nr2, (nr1, nr2) + if j not in ellipsisroots or len(ellipsisroots[j]) < 2: + return j, (nr1, nr2) + raise error.Abort('Failed to split up ellipsis node! 
head: %d, ' + 'roots: %d %d %d' % (head, r1, r2, r3)) + + missing = list(cl.findmissingrevs(common=commonrevs, heads=headsrevs)) + visit = reversed(missing) + relevant_nodes = set() + visitnodes = [cl.node(m) for m in missing] + required = set(headsrevs) | known + for rev in visit: + clrev = cl.changelogrevision(rev) + ps = cldag.parents(rev) + if depth is not None: + curdepth = revdepth[rev] + for p in ps: + revdepth[p] = min(curdepth + 1, revdepth.get(p, depth + 1)) + needed = False + shallow_enough = depth is None or revdepth[rev] <= depth + if shallow_enough: + curmf = mfl[clrev.manifest].read() + if ps: + # We choose to not trust the changed files list in + # changesets because it's not always correct. TODO: could + # we trust it for the non-merge case? + p1mf = mfl[cl.changelogrevision(ps[0]).manifest].read() + needed = bool(curmf.diff(p1mf, match)) + if not needed and len(ps) > 1: + # For merge changes, the list of changed files is not + # helpful, since we need to emit the merge if a file + # in the narrow spec has changed on either side of the + # merge. As a result, we do a manifest diff to check. + p2mf = mfl[cl.changelogrevision(ps[1]).manifest].read() + needed = bool(curmf.diff(p2mf, match)) + else: + # For a root node, we need to include the node if any + # files in the node match the narrowspec. 
+ needed = any(curmf.walk(match)) + + if needed: + for head in ellipsisheads[rev]: + addroot(head, rev) + for p in ps: + required.add(p) + relevant_nodes.add(cl.node(rev)) + else: + if not ps: + ps = [nullrev] + if rev in required: + for head in ellipsisheads[rev]: + addroot(head, rev) + for p in ps: + ellipsisheads[p].add(rev) + else: + for p in ps: + ellipsisheads[p] |= ellipsisheads[rev] + + # add common changesets as roots of their reachable ellipsis heads + for c in commonrevs: + for head in ellipsisheads[c]: + addroot(head, c) + return visitnodes, relevant_nodes, ellipsisroots + +def _packellipsischangegroup(repo, common, match, relevant_nodes, + ellipsisroots, visitnodes, depth, source, version): + if version in ('01', '02'): + raise error.Abort( + 'ellipsis nodes require at least cg3 on client and server, ' + 'but negotiated version %s' % version) + # We wrap cg1packer.revchunk, using a side channel to pass + # relevant_nodes into that area. Then if linknode isn't in the + # set, we know we have an ellipsis node and we should defer + # sending that node's data. We override close() to detect + # pending ellipsis nodes and flush them. + packer = changegroup.getbundler(version, repo) + # Let the packer have access to the narrow matcher so it can + # omit filelogs and dirlogs as needed + packer._narrow_matcher = lambda : match + # Give the packer the list of nodes which should not be + # ellipsis nodes. We store this rather than the set of nodes + # that should be an ellipsis because for very large histories + # we expect this to be significantly smaller. + packer.full_nodes = relevant_nodes + # Maps ellipsis revs to their roots at the changelog level. + packer.precomputed_ellipsis = ellipsisroots + # Maps CL revs to per-revlog revisions. Cleared in close() at + # the end of each group. + packer.clrev_to_localrev = {} + packer.next_clrev_to_localrev = {} + # Maps changelog nodes to changelog revs. 
Filled in once + # during changelog stage and then left unmodified. + packer.clnode_to_rev = {} + packer.changelog_done = False + # If true, informs the packer that it is serving shallow content and might + # need to pack file contents not introduced by the changes being packed. + packer.is_shallow = depth is not None + + return packer.generate(common, visitnodes, False, source) + +# Serve a changegroup for a client with a narrow clone. +def getbundlechangegrouppart_narrow(bundler, repo, source, + bundlecaps=None, b2caps=None, heads=None, + common=None, **kwargs): + cgversions = b2caps.get('changegroup') + if cgversions: # 3.1 and 3.2 ship with an empty value + cgversions = [v for v in cgversions + if v in changegroup.supportedoutgoingversions(repo)] + if not cgversions: + raise ValueError(_('no common changegroup version')) + version = max(cgversions) + else: + raise ValueError(_("server does not advertise changegroup version," + " can't negotiate support for ellipsis nodes")) + + include = sorted(filter(bool, kwargs.get(r'includepats', []))) + exclude = sorted(filter(bool, kwargs.get(r'excludepats', []))) + newmatch = narrowspec.match(repo.root, include=include, exclude=exclude) + if not repo.ui.configbool("experimental", "narrowservebrokenellipses"): + outgoing = exchange._computeoutgoing(repo, heads, common) + if not outgoing.missing: + return + def wrappedgetbundler(orig, *args, **kwargs): + bundler = orig(*args, **kwargs) + bundler._narrow_matcher = lambda : newmatch + return bundler + with extensions.wrappedfunction(changegroup, 'getbundler', + wrappedgetbundler): + cg = changegroup.makestream(repo, outgoing, version, source) + part = bundler.newpart('changegroup', data=cg) + part.addparam('version', version) + if 'treemanifest' in repo.requirements: + part.addparam('treemanifest', '1') + + if include or exclude: + narrowspecpart = bundler.newpart(_SPECPART) + if include: + narrowspecpart.addparam( + _SPECPART_INCLUDE, '\n'.join(include), mandatory=True) + 
if exclude: + narrowspecpart.addparam( + _SPECPART_EXCLUDE, '\n'.join(exclude), mandatory=True) + + return + + depth = kwargs.get(r'depth', None) + if depth is not None: + depth = int(depth) + if depth < 1: + raise error.Abort(_('depth must be positive, got %d') % depth) + + heads = set(heads or repo.heads()) + common = set(common or [nullid]) + oldinclude = sorted(filter(bool, kwargs.get(r'oldincludepats', []))) + oldexclude = sorted(filter(bool, kwargs.get(r'oldexcludepats', []))) + known = {bin(n) for n in kwargs.get(r'known', [])} + if known and (oldinclude != include or oldexclude != exclude): + # Steps: + # 1. Send kill for "$known & ::common" + # + # 2. Send changegroup for ::common + # + # 3. Proceed. + # + # In the future, we can send kills for only the specific + # nodes we know should go away or change shape, and then + # send a data stream that tells the client something like this: + # + # a) apply this changegroup + # b) apply nodes XXX, YYY, ZZZ that you already have + # c) goto a + # + # until they've built up the full new state. + # Convert to revnums and intersect with "common". The client should + # have made it a subset of "common" already, but let's be safe. + known = set(repo.revs("%ln & ::%ln", known, common)) + # TODO: we could send only roots() of this set, and the + # list of nodes in common, and the client could work out + # what to strip, instead of us explicitly sending every + # single node. 
+ deadrevs = known + def genkills(): + for r in deadrevs: + yield _KILLNODESIGNAL + yield repo.changelog.node(r) + yield _DONESIGNAL + bundler.newpart(_CHANGESPECPART, data=genkills()) + newvisit, newfull, newellipsis = _computeellipsis( + repo, set(), common, known, newmatch) + if newvisit: + cg = _packellipsischangegroup( + repo, common, newmatch, newfull, newellipsis, + newvisit, depth, source, version) + part = bundler.newpart('changegroup', data=cg) + part.addparam('version', version) + if 'treemanifest' in repo.requirements: + part.addparam('treemanifest', '1') + + visitnodes, relevant_nodes, ellipsisroots = _computeellipsis( + repo, common, heads, set(), newmatch, depth=depth) + + repo.ui.debug('Found %d relevant revs\n' % len(relevant_nodes)) + if visitnodes: + cg = _packellipsischangegroup( + repo, common, newmatch, relevant_nodes, ellipsisroots, + visitnodes, depth, source, version) + part = bundler.newpart('changegroup', data=cg) + part.addparam('version', version) + if 'treemanifest' in repo.requirements: + part.addparam('treemanifest', '1') + +def applyacl_narrow(repo, kwargs): + ui = repo.ui + username = ui.shortuser(ui.environ.get('REMOTE_USER') or ui.username()) + user_includes = ui.configlist( + _NARROWACL_SECTION, username + '.includes', + ui.configlist(_NARROWACL_SECTION, 'default.includes')) + user_excludes = ui.configlist( + _NARROWACL_SECTION, username + '.excludes', + ui.configlist(_NARROWACL_SECTION, 'default.excludes')) + if not user_includes: + raise error.Abort(_("{} configuration for user {} is empty") + .format(_NARROWACL_SECTION, username)) + + user_includes = [ + 'path:.' if p == '*' else 'path:' + p for p in user_includes] + user_excludes = [ + 'path:.' 
if p == '*' else 'path:' + p for p in user_excludes] + + req_includes = set(kwargs.get(r'includepats', [])) + req_excludes = set(kwargs.get(r'excludepats', [])) + + req_includes, req_excludes, invalid_includes = narrowspec.restrictpatterns( + req_includes, req_excludes, user_includes, user_excludes) + + if invalid_includes: + raise error.Abort( + _("The following includes are not accessible for {}: {}") + .format(username, invalid_includes)) + + new_args = {} + new_args.update(kwargs) + new_args['includepats'] = req_includes + if req_excludes: + new_args['excludepats'] = req_excludes + return new_args + +@bundle2.parthandler(_SPECPART, (_SPECPART_INCLUDE, _SPECPART_EXCLUDE)) +def _handlechangespec_2(op, inpart): + includepats = set(inpart.params.get(_SPECPART_INCLUDE, '').splitlines()) + excludepats = set(inpart.params.get(_SPECPART_EXCLUDE, '').splitlines()) + if not changegroup.NARROW_REQUIREMENT in op.repo.requirements: + op.repo.requirements.add(changegroup.NARROW_REQUIREMENT) + op.repo._writerequirements() + op.repo.setnarrowpats(includepats, excludepats) + +@bundle2.parthandler(_CHANGESPECPART) +def _handlechangespec(op, inpart): + repo = op.repo + cl = repo.changelog + + # changesets which need to be stripped entirely. either they're no longer + # needed in the new narrow spec, or the server is sending a replacement + # in the changegroup part. + clkills = set() + + # A changespec part contains all the updates to ellipsis nodes + # that will happen as a result of widening or narrowing a + # repo. All the changes that this block encounters are ellipsis + # nodes or flags to kill an existing ellipsis. 
+ chunksignal = changegroup.readexactly(inpart, 4) + while chunksignal != _DONESIGNAL: + if chunksignal == _KILLNODESIGNAL: + # a node used to be an ellipsis but isn't anymore + ck = changegroup.readexactly(inpart, 20) + if cl.hasnode(ck): + clkills.add(ck) + else: + raise error.Abort( + _('unexpected changespec node chunk type: %s') % chunksignal) + chunksignal = changegroup.readexactly(inpart, 4) + + if clkills: + # preserve bookmarks that repair.strip() would otherwise strip + bmstore = repo._bookmarks + class dummybmstore(dict): + def applychanges(self, repo, tr, changes): + pass + def recordchange(self, tr): # legacy version + pass + repo._bookmarks = dummybmstore() + chgrpfile = repair.strip(op.ui, repo, list(clkills), backup=True, + topic='widen') + repo._bookmarks = bmstore + if chgrpfile: + # presence of _widen_bundle attribute activates widen handler later + op._widen_bundle = chgrpfile + # Set the new narrowspec if we're widening. The setnewnarrowpats() method + # will currently always be there when using the core+narrowhg server, but + # other servers may include a changespec part even when not widening (e.g. + # because we're deepening a shallow repo). + if util.safehasattr(repo, 'setnewnarrowpats'): + repo.setnewnarrowpats() + +def handlechangegroup_widen(op, inpart): + """Changegroup exchange handler which restores temporarily-stripped nodes""" + # We saved a bundle with stripped node data we must now restore. + # This approach is based on mercurial/repair.py@6ee26a53c111. 
+ repo = op.repo + ui = op.ui + + chgrpfile = op._widen_bundle + del op._widen_bundle + vfs = repo.vfs + + ui.note(_("adding branch\n")) + f = vfs.open(chgrpfile, "rb") + try: + gen = exchange.readbundle(ui, f, chgrpfile, vfs) + if not ui.verbose: + # silence internal shuffling chatter + ui.pushbuffer() + if isinstance(gen, bundle2.unbundle20): + with repo.transaction('strip') as tr: + bundle2.processbundle(repo, gen, lambda: tr) + else: + gen.apply(repo, 'strip', 'bundle:' + vfs.join(chgrpfile), True) + if not ui.verbose: + ui.popbuffer() + finally: + f.close() + + # remove undo files + for undovfs, undofile in repo.undofiles(): + try: + undovfs.unlink(undofile) + except OSError as e: + if e.errno != errno.ENOENT: + ui.warn(_('error removing %s: %s\n') % + (undovfs.join(undofile), stringutil.forcebytestr(e))) + + # Remove partial backup only if there were no exceptions + vfs.unlink(chgrpfile) + +def setup(): + """Enable narrow repo support in bundle2-related extension points.""" + extensions.wrapfunction(bundle2, 'getrepocaps', getrepocaps_narrow) + + getbundleargs = wireprototypes.GETBUNDLE_ARGUMENTS + + getbundleargs['narrow'] = 'boolean' + getbundleargs['depth'] = 'plain' + getbundleargs['oldincludepats'] = 'csv' + getbundleargs['oldexcludepats'] = 'csv' + getbundleargs['includepats'] = 'csv' + getbundleargs['excludepats'] = 'csv' + getbundleargs['known'] = 'csv' + + # Extend changegroup serving to handle requests from narrow clients. 
+ origcgfn = exchange.getbundle2partsmapping['changegroup'] + def wrappedcgfn(*args, **kwargs): + repo = args[1] + if repo.ui.has_section(_NARROWACL_SECTION): + getbundlechangegrouppart_narrow( + *args, **applyacl_narrow(repo, kwargs)) + elif kwargs.get(r'narrow', False): + getbundlechangegrouppart_narrow(*args, **kwargs) + else: + origcgfn(*args, **kwargs) + exchange.getbundle2partsmapping['changegroup'] = wrappedcgfn + + # disable rev branch cache exchange when serving a narrow bundle + # (currently incompatible with that part) + origrbcfn = exchange.getbundle2partsmapping['cache:rev-branch-cache'] + def wrappedcgfn(*args, **kwargs): + repo = args[1] + if repo.ui.has_section(_NARROWACL_SECTION): + return + elif kwargs.get(r'narrow', False): + return + else: + origrbcfn(*args, **kwargs) + exchange.getbundle2partsmapping['cache:rev-branch-cache'] = wrappedcgfn + + # Extend changegroup receiver so client can fixup after widen requests. + origcghandler = bundle2.parthandlermapping['changegroup'] + def wrappedcghandler(op, inpart): + origcghandler(op, inpart) + if util.safehasattr(op, '_widen_bundle'): + handlechangegroup_widen(op, inpart) + wrappedcghandler.params = origcghandler.params + bundle2.parthandlermapping['changegroup'] = wrappedcghandler diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowchangegroup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowchangegroup.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,373 @@ +# narrowchangegroup.py - narrow clone changegroup creation and consumption +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. 
+ +from __future__ import absolute_import + +from mercurial.i18n import _ +from mercurial import ( + changegroup, + error, + extensions, + manifest, + match as matchmod, + mdiff, + node, + revlog, + util, +) + +def setup(): + + def _cgmatcher(cgpacker): + localmatcher = cgpacker._repo.narrowmatch() + remotematcher = getattr(cgpacker, '_narrow_matcher', lambda: None)() + if remotematcher: + return matchmod.intersectmatchers(localmatcher, remotematcher) + else: + return localmatcher + + def prune(orig, self, revlog, missing, commonrevs): + if isinstance(revlog, manifest.manifestrevlog): + matcher = _cgmatcher(self) + if (matcher and + not matcher.visitdir(revlog._dir[:-1] or '.')): + return [] + return orig(self, revlog, missing, commonrevs) + + extensions.wrapfunction(changegroup.cg1packer, 'prune', prune) + + def generatefiles(orig, self, changedfiles, linknodes, commonrevs, + source): + matcher = _cgmatcher(self) + if matcher: + changedfiles = list(filter(matcher, changedfiles)) + if getattr(self, 'is_shallow', False): + # See comment in generate() for why this sadness is a thing. + mfdicts = self._mfdicts + del self._mfdicts + # In a shallow clone, the linknodes callback needs to also include + # those file nodes that are in the manifests we sent but weren't + # introduced by those manifests. 
+ commonctxs = [self._repo[c] for c in commonrevs] + oldlinknodes = linknodes + clrev = self._repo.changelog.rev + def linknodes(flog, fname): + for c in commonctxs: + try: + fnode = c.filenode(fname) + self.clrev_to_localrev[c.rev()] = flog.rev(fnode) + except error.ManifestLookupError: + pass + links = oldlinknodes(flog, fname) + if len(links) != len(mfdicts): + for mf, lr in mfdicts: + fnode = mf.get(fname, None) + if fnode in links: + links[fnode] = min(links[fnode], lr, key=clrev) + elif fnode: + links[fnode] = lr + return links + return orig(self, changedfiles, linknodes, commonrevs, source) + extensions.wrapfunction( + changegroup.cg1packer, 'generatefiles', generatefiles) + + def ellipsisdata(packer, rev, revlog_, p1, p2, data, linknode): + n = revlog_.node(rev) + p1n, p2n = revlog_.node(p1), revlog_.node(p2) + flags = revlog_.flags(rev) + flags |= revlog.REVIDX_ELLIPSIS + meta = packer.builddeltaheader( + n, p1n, p2n, node.nullid, linknode, flags) + # TODO: try and actually send deltas for ellipsis data blocks + diffheader = mdiff.trivialdiffheader(len(data)) + l = len(meta) + len(diffheader) + len(data) + return ''.join((changegroup.chunkheader(l), + meta, + diffheader, + data)) + + def close(orig, self): + getattr(self, 'clrev_to_localrev', {}).clear() + if getattr(self, 'next_clrev_to_localrev', {}): + self.clrev_to_localrev = self.next_clrev_to_localrev + del self.next_clrev_to_localrev + self.changelog_done = True + return orig(self) + extensions.wrapfunction(changegroup.cg1packer, 'close', close) + + # In a perfect world, we'd generate better ellipsis-ified graphs + # for non-changelog revlogs. In practice, we haven't started doing + # that yet, so the resulting DAGs for the manifestlog and filelogs + # are actually full of bogus parentage on all the ellipsis + # nodes. 
This has the side effect that, while the contents are + # correct, the individual DAGs might be completely out of whack in + # a case like 882681bc3166 and its ancestors (back about 10 + # revisions or so) in the main hg repo. + # + # The one invariant we *know* holds is that the new (potentially + # bogus) DAG shape will be valid if we order the nodes in the + # order that they're introduced in dramatis personae by the + # changelog, so what we do is we sort the non-changelog histories + # by the order in which they are used by the changelog. + def _sortgroup(orig, self, revlog, nodelist, lookup): + if not util.safehasattr(self, 'full_nodes') or not self.clnode_to_rev: + return orig(self, revlog, nodelist, lookup) + key = lambda n: self.clnode_to_rev[lookup(n)] + return [revlog.rev(n) for n in sorted(nodelist, key=key)] + + extensions.wrapfunction(changegroup.cg1packer, '_sortgroup', _sortgroup) + + def generate(orig, self, commonrevs, clnodes, fastpathlinkrev, source): + '''yield a sequence of changegroup chunks (strings)''' + # Note: other than delegating to orig, the only deviation in + # logic from normal hg's generate is marked with BEGIN/END + # NARROW HACK. + if not util.safehasattr(self, 'full_nodes'): + # not sending a narrow bundle + for x in orig(self, commonrevs, clnodes, fastpathlinkrev, source): + yield x + return + + repo = self._repo + cl = repo.changelog + mfl = repo.manifestlog + mfrevlog = mfl._revlog + + clrevorder = {} + mfs = {} # needed manifests + fnodes = {} # needed file nodes + changedfiles = set() + + # Callback for the changelog, used to collect changed files and manifest + # nodes. + # Returns the linkrev node (identity in the changelog case). + def lookupcl(x): + c = cl.read(x) + clrevorder[x] = len(clrevorder) + # BEGIN NARROW HACK + # + # Only update mfs if x is going to be sent. Otherwise we + # end up with bogus linkrevs specified for manifests and + # we skip some manifest nodes that we should otherwise + # have sent. 
+ if x in self.full_nodes or cl.rev(x) in self.precomputed_ellipsis: + n = c[0] + # record the first changeset introducing this manifest version + mfs.setdefault(n, x) + # Set this narrow-specific dict so we have the lowest manifest + # revnum to look up for this cl revnum. (Part of mapping + # changelog ellipsis parents to manifest ellipsis parents) + self.next_clrev_to_localrev.setdefault(cl.rev(x), + mfrevlog.rev(n)) + # We can't trust the changed files list in the changeset if the + # client requested a shallow clone. + if self.is_shallow: + changedfiles.update(mfl[c[0]].read().keys()) + else: + changedfiles.update(c[3]) + # END NARROW HACK + # Record a complete list of potentially-changed files in + # this manifest. + return x + + self._verbosenote(_('uncompressed size of bundle content:\n')) + size = 0 + for chunk in self.group(clnodes, cl, lookupcl, units=_('changesets')): + size += len(chunk) + yield chunk + self._verbosenote(_('%8.i (changelog)\n') % size) + + # We need to make sure that the linkrev in the changegroup refers to + # the first changeset that introduced the manifest or file revision. + # The fastpath is usually safer than the slowpath, because the filelogs + # are walked in revlog order. + # + # When taking the slowpath with reorder=None and the manifest revlog + # uses generaldelta, the manifest may be walked in the "wrong" order. + # Without 'clrevorder', we would get an incorrect linkrev (see fix in + # cc0ff93d0c0c). + # + # When taking the fastpath, we are only vulnerable to reordering + # of the changelog itself. The changelog never uses generaldelta, so + # it is only reordered when reorder=True. To handle this case, we + # simply take the slowpath, which already has the 'clrevorder' logic. + # This was also fixed in cc0ff93d0c0c. + fastpathlinkrev = fastpathlinkrev and not self._reorder + # Treemanifests don't work correctly with fastpathlinkrev + # either, because we don't discover which directory nodes to + # send along with files. 
This could probably be fixed. + fastpathlinkrev = fastpathlinkrev and ( + 'treemanifest' not in repo.requirements) + # Shallow clones also don't work correctly with fastpathlinkrev + # because file nodes may need to be sent for a manifest even if they + # weren't introduced by that manifest. + fastpathlinkrev = fastpathlinkrev and not self.is_shallow + + for chunk in self.generatemanifests(commonrevs, clrevorder, + fastpathlinkrev, mfs, fnodes, source): + yield chunk + # BEGIN NARROW HACK + mfdicts = None + if self.is_shallow: + mfdicts = [(self._repo.manifestlog[n].read(), lr) + for (n, lr) in mfs.iteritems()] + # END NARROW HACK + mfs.clear() + clrevs = set(cl.rev(x) for x in clnodes) + + if not fastpathlinkrev: + def linknodes(unused, fname): + return fnodes.get(fname, {}) + else: + cln = cl.node + def linknodes(filerevlog, fname): + llr = filerevlog.linkrev + fln = filerevlog.node + revs = ((r, llr(r)) for r in filerevlog) + return dict((fln(r), cln(lr)) for r, lr in revs if lr in clrevs) + + # BEGIN NARROW HACK + # + # We need to pass the mfdicts variable down into + # generatefiles(), but more than one command might have + # wrapped generatefiles so we can't modify the function + # signature. Instead, we pass the data to ourselves using an + # instance attribute. I'm sorry. + self._mfdicts = mfdicts + # END NARROW HACK + for chunk in self.generatefiles(changedfiles, linknodes, commonrevs, + source): + yield chunk + + yield self.close() + + if clnodes: + repo.hook('outgoing', node=node.hex(clnodes[0]), source=source) + extensions.wrapfunction(changegroup.cg1packer, 'generate', generate) + + def revchunk(orig, self, revlog, rev, prev, linknode): + if not util.safehasattr(self, 'full_nodes'): + # not sending a narrow changegroup + for x in orig(self, revlog, rev, prev, linknode): + yield x + return + # build up some mapping information that's useful later. See + # the local() nested function below. 
+ if not self.changelog_done: + self.clnode_to_rev[linknode] = rev + linkrev = rev + self.clrev_to_localrev[linkrev] = rev + else: + linkrev = self.clnode_to_rev[linknode] + self.clrev_to_localrev[linkrev] = rev + # This is a node to send in full, because the changeset it + # corresponds to was a full changeset. + if linknode in self.full_nodes: + for x in orig(self, revlog, rev, prev, linknode): + yield x + return + # At this point, a node can either be one we should skip or an + # ellipsis. If it's not an ellipsis, bail immediately. + if linkrev not in self.precomputed_ellipsis: + return + linkparents = self.precomputed_ellipsis[linkrev] + def local(clrev): + """Turn a changelog revnum into a local revnum. + + The ellipsis dag is stored as revnums on the changelog, + but when we're producing ellipsis entries for + non-changelog revlogs, we need to turn those numbers into + something local. This does that for us, and during the + changelog sending phase will also expand the stored + mappings as needed. + """ + if clrev == node.nullrev: + return node.nullrev + if not self.changelog_done: + # If we're doing the changelog, it's possible that we + # have a parent that is already on the client, and we + # need to store some extra mapping information so that + # our contained ellipsis nodes will be able to resolve + # their parents. + if clrev not in self.clrev_to_localrev: + clnode = revlog.node(clrev) + self.clnode_to_rev[clnode] = clrev + return clrev + # Walk the ellipsis-ized changelog breadth-first looking for a + # change that has been linked from the current revlog. + # + # For a flat manifest revlog only a single step should be necessary + # as all relevant changelog entries are relevant to the flat + # manifest. + # + # For a filelog or tree manifest dirlog however not every changelog + # entry will have been relevant, so we need to skip some changelog + # nodes even after ellipsis-izing. 
+ walk = [clrev] + while walk: + p = walk[0] + walk = walk[1:] + if p in self.clrev_to_localrev: + return self.clrev_to_localrev[p] + elif p in self.full_nodes: + walk.extend([pp for pp in self._repo.changelog.parentrevs(p) + if pp != node.nullrev]) + elif p in self.precomputed_ellipsis: + walk.extend([pp for pp in self.precomputed_ellipsis[p] + if pp != node.nullrev]) + else: + # In this case, we've got an ellipsis with parents + # outside the current bundle (likely an + # incremental pull). We "know" that we can use the + # value of this same revlog at whatever revision + # is pointed to by linknode. "Know" is in scare + # quotes because I haven't done enough examination + # of edge cases to convince myself this is really + # a fact - it works for all the (admittedly + # thorough) cases in our testsuite, but I would be + # somewhat unsurprised to find a case in the wild + # where this breaks down a bit. That said, I don't + # know if it would hurt anything. + for i in xrange(rev, 0, -1): + if revlog.linkrev(i) == clrev: + return i + # We failed to resolve a parent for this node, so + # we crash the changegroup construction. + raise error.Abort( + 'unable to resolve parent while packing %r %r' + ' for changeset %r' % (revlog.indexfile, rev, clrev)) + return node.nullrev + + if not linkparents or ( + revlog.parentrevs(rev) == (node.nullrev, node.nullrev)): + p1, p2 = node.nullrev, node.nullrev + elif len(linkparents) == 1: + p1, = sorted(local(p) for p in linkparents) + p2 = node.nullrev + else: + p1, p2 = sorted(local(p) for p in linkparents) + n = revlog.node(rev) + yield ellipsisdata( + self, rev, revlog, p1, p2, revlog.revision(n), linknode) + extensions.wrapfunction(changegroup.cg1packer, 'revchunk', revchunk) + + def deltaparent(orig, self, revlog, rev, p1, p2, prev): + if util.safehasattr(self, 'full_nodes'): + # TODO: send better deltas when in narrow mode. + # + # changegroup.group() loops over revisions to send, + # including revisions we'll skip. 
What this means is that + # `prev` will be a potentially useless delta base for all + # ellipsis nodes, as the client likely won't have it. In + # the future we should do bookkeeping about which nodes + # have been sent to the client, and try to be + # significantly smarter about delta bases. This is + # slightly tricky because this same code has to work for + # all revlogs, and we don't have the linkrev/linknode here. + return p1 + return orig(self, revlog, rev, p1, p2, prev) + extensions.wrapfunction(changegroup.cg2packer, 'deltaparent', deltaparent) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowcommands.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowcommands.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,401 @@ +# narrowcommands.py - command modifications for narrowhg extension +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. +from __future__ import absolute_import + +import itertools + +from mercurial.i18n import _ +from mercurial import ( + changegroup, + cmdutil, + commands, + discovery, + error, + exchange, + extensions, + hg, + merge, + narrowspec, + node, + pycompat, + registrar, + repair, + repoview, + util, +) + +from . 
import ( + narrowbundle2, +) + +table = {} +command = registrar.command(table) + +def setup(): + """Wraps user-facing mercurial commands with narrow-aware versions.""" + + entry = extensions.wrapcommand(commands.table, 'clone', clonenarrowcmd) + entry[1].append(('', 'narrow', None, + _("create a narrow clone of select files"))) + entry[1].append(('', 'depth', '', + _("limit the history fetched by distance from heads"))) + # TODO(durin42): unify sparse/narrow --include/--exclude logic a bit + if 'sparse' not in extensions.enabled(): + entry[1].append(('', 'include', [], + _("specifically fetch this file/directory"))) + entry[1].append( + ('', 'exclude', [], + _("do not fetch this file/directory, even if included"))) + + entry = extensions.wrapcommand(commands.table, 'pull', pullnarrowcmd) + entry[1].append(('', 'depth', '', + _("limit the history fetched by distance from heads"))) + + extensions.wrapcommand(commands.table, 'archive', archivenarrowcmd) + +def expandpull(pullop, includepats, excludepats): + if not narrowspec.needsexpansion(includepats): + return includepats, excludepats + + heads = pullop.heads or pullop.rheads + includepats, excludepats = pullop.remote.expandnarrow( + includepats, excludepats, heads) + pullop.repo.ui.debug('Expanded narrowspec to inc=%s, exc=%s\n' % ( + includepats, excludepats)) + return set(includepats), set(excludepats) + +def clonenarrowcmd(orig, ui, repo, *args, **opts): + """Wraps clone command, so 'hg clone' first wraps localrepo.clone().""" + opts = pycompat.byteskwargs(opts) + wrappedextraprepare = util.nullcontextmanager() + opts_narrow = opts['narrow'] + if opts_narrow: + def pullbundle2extraprepare_widen(orig, pullop, kwargs): + # Create narrow spec patterns from clone flags + includepats = narrowspec.parsepatterns(opts['include']) + excludepats = narrowspec.parsepatterns(opts['exclude']) + + # If necessary, ask the server to expand the narrowspec. 
+ includepats, excludepats = expandpull( + pullop, includepats, excludepats) + + if not includepats and excludepats: + # If nothing was included, we assume the user meant to include + # everything, except what they asked to exclude. + includepats = {'path:.'} + + pullop.repo.setnarrowpats(includepats, excludepats) + + # This will populate 'includepats' etc with the values from the + # narrowspec we just saved. + orig(pullop, kwargs) + + if opts.get('depth'): + kwargs['depth'] = opts['depth'] + wrappedextraprepare = extensions.wrappedfunction(exchange, + '_pullbundle2extraprepare', pullbundle2extraprepare_widen) + + def pullnarrow(orig, repo, *args, **kwargs): + if opts_narrow: + repo.requirements.add(changegroup.NARROW_REQUIREMENT) + repo._writerequirements() + + return orig(repo, *args, **kwargs) + + wrappedpull = extensions.wrappedfunction(exchange, 'pull', pullnarrow) + + with wrappedextraprepare, wrappedpull: + return orig(ui, repo, *args, **pycompat.strkwargs(opts)) + +def pullnarrowcmd(orig, ui, repo, *args, **opts): + """Wraps pull command to allow modifying narrow spec.""" + wrappedextraprepare = util.nullcontextmanager() + if changegroup.NARROW_REQUIREMENT in repo.requirements: + + def pullbundle2extraprepare_widen(orig, pullop, kwargs): + orig(pullop, kwargs) + if opts.get(r'depth'): + kwargs['depth'] = opts[r'depth'] + wrappedextraprepare = extensions.wrappedfunction(exchange, + '_pullbundle2extraprepare', pullbundle2extraprepare_widen) + + with wrappedextraprepare: + return orig(ui, repo, *args, **opts) + +def archivenarrowcmd(orig, ui, repo, *args, **opts): + """Wraps archive command to narrow the default includes.""" + if changegroup.NARROW_REQUIREMENT in repo.requirements: + repo_includes, repo_excludes = repo.narrowpats + includes = set(opts.get(r'include', [])) + excludes = set(opts.get(r'exclude', [])) + includes, excludes, unused_invalid = narrowspec.restrictpatterns( + includes, excludes, repo_includes, repo_excludes) + if includes: + 
opts[r'include'] = includes + if excludes: + opts[r'exclude'] = excludes + return orig(ui, repo, *args, **opts) + +def pullbundle2extraprepare(orig, pullop, kwargs): + repo = pullop.repo + if changegroup.NARROW_REQUIREMENT not in repo.requirements: + return orig(pullop, kwargs) + + if narrowbundle2.NARROWCAP not in pullop.remotebundle2caps: + raise error.Abort(_("server doesn't support narrow clones")) + orig(pullop, kwargs) + kwargs['narrow'] = True + include, exclude = repo.narrowpats + kwargs['oldincludepats'] = include + kwargs['oldexcludepats'] = exclude + kwargs['includepats'] = include + kwargs['excludepats'] = exclude + kwargs['known'] = [node.hex(ctx.node()) for ctx in + repo.set('::%ln', pullop.common) + if ctx.node() != node.nullid] + if not kwargs['known']: + # Mercurial serialized an empty list as '' and deserializes it as + # [''], so delete it instead to avoid handling the empty string on the + # server. + del kwargs['known'] + +extensions.wrapfunction(exchange,'_pullbundle2extraprepare', + pullbundle2extraprepare) + +def _narrow(ui, repo, remote, commoninc, oldincludes, oldexcludes, + newincludes, newexcludes, force): + oldmatch = narrowspec.match(repo.root, oldincludes, oldexcludes) + newmatch = narrowspec.match(repo.root, newincludes, newexcludes) + + # This is essentially doing "hg outgoing" to find all local-only + # commits. We will then check that the local-only commits don't + # have any changes to files that will be untracked. 
+ unfi = repo.unfiltered() + outgoing = discovery.findcommonoutgoing(unfi, remote, + commoninc=commoninc) + ui.status(_('looking for local changes to affected paths\n')) + localnodes = [] + for n in itertools.chain(outgoing.missing, outgoing.excluded): + if any(oldmatch(f) and not newmatch(f) for f in unfi[n].files()): + localnodes.append(n) + revstostrip = unfi.revs('descendants(%ln)', localnodes) + hiddenrevs = repoview.filterrevs(repo, 'visible') + visibletostrip = list(repo.changelog.node(r) + for r in (revstostrip - hiddenrevs)) + if visibletostrip: + ui.status(_('The following changeset(s) or their ancestors have ' + 'local changes not on the remote:\n')) + maxnodes = 10 + if ui.verbose or len(visibletostrip) <= maxnodes: + for n in visibletostrip: + ui.status('%s\n' % node.short(n)) + else: + for n in visibletostrip[:maxnodes]: + ui.status('%s\n' % node.short(n)) + ui.status(_('...and %d more, use --verbose to list all\n') % + (len(visibletostrip) - maxnodes)) + if not force: + raise error.Abort(_('local changes found'), + hint=_('use --force-delete-local-changes to ' + 'ignore')) + + if revstostrip: + tostrip = [unfi.changelog.node(r) for r in revstostrip] + if repo['.'].node() in tostrip: + # stripping working copy, so move to a different commit first + urev = max(repo.revs('(::%n) - %ln + null', + repo['.'].node(), visibletostrip)) + hg.clean(repo, urev) + repair.strip(ui, unfi, tostrip, topic='narrow') + + todelete = [] + for f, f2, size in repo.store.datafiles(): + if f.startswith('data/'): + file = f[5:-2] + if not newmatch(file): + todelete.append(f) + elif f.startswith('meta/'): + dir = f[5:-13] + dirs = ['.'] + sorted(util.dirs({dir})) + [dir] + include = True + for d in dirs: + visit = newmatch.visitdir(d) + if not visit: + include = False + break + if visit == 'all': + break + if not include: + todelete.append(f) + + repo.destroying() + + with repo.transaction("narrowing"): + for f in todelete: + ui.status(_('deleting %s\n') % f) + 
util.unlinkpath(repo.svfs.join(f)) + repo.store.markremoved(f) + + for f in repo.dirstate: + if not newmatch(f): + repo.dirstate.drop(f) + repo.wvfs.unlinkpath(f) + repo.setnarrowpats(newincludes, newexcludes) + + repo.destroyed() + +def _widen(ui, repo, remote, commoninc, newincludes, newexcludes): + newmatch = narrowspec.match(repo.root, newincludes, newexcludes) + + # TODO(martinvonz): Get expansion working with widening/narrowing. + if narrowspec.needsexpansion(newincludes): + raise error.Abort('Expansion not yet supported on pull') + + def pullbundle2extraprepare_widen(orig, pullop, kwargs): + orig(pullop, kwargs) + # The old{in,ex}cludepats have already been set by orig() + kwargs['includepats'] = newincludes + kwargs['excludepats'] = newexcludes + wrappedextraprepare = extensions.wrappedfunction(exchange, + '_pullbundle2extraprepare', pullbundle2extraprepare_widen) + + # define a function that narrowbundle2 can call after creating the + # backup bundle, but before applying the bundle from the server + def setnewnarrowpats(): + repo.setnarrowpats(newincludes, newexcludes) + repo.setnewnarrowpats = setnewnarrowpats + + ds = repo.dirstate + p1, p2 = ds.p1(), ds.p2() + with ds.parentchange(): + ds.setparents(node.nullid, node.nullid) + common = commoninc[0] + with wrappedextraprepare: + exchange.pull(repo, remote, heads=common) + with ds.parentchange(): + ds.setparents(p1, p2) + + actions = {k: [] for k in 'a am f g cd dc r dm dg m e k p pr'.split()} + addgaction = actions['g'].append + + mf = repo['.'].manifest().matches(newmatch) + for f, fn in mf.iteritems(): + if f not in repo.dirstate: + addgaction((f, (mf.flags(f), False), + "add from widened narrow clone")) + + merge.applyupdates(repo, actions, wctx=repo[None], + mctx=repo['.'], overwrite=False) + merge.recordupdates(repo, actions, branchmerge=False) + +# TODO(rdamazio): Make new matcher format and update description +@command('tracked', + [('', 'addinclude', [], _('new paths to include')), + ('', 
'removeinclude', [], _('old paths to no longer include')), + ('', 'addexclude', [], _('new paths to exclude')), + ('', 'removeexclude', [], _('old paths to no longer exclude')), + ('', 'clear', False, _('whether to replace the existing narrowspec')), + ('', 'force-delete-local-changes', False, + _('forces deletion of local changes when narrowing')), + ] + commands.remoteopts, + _('[OPTIONS]... [REMOTE]'), + inferrepo=True) +def trackedcmd(ui, repo, remotepath=None, *pats, **opts): + """show or change the current narrowspec + + With no argument, shows the current narrowspec entries, one per line. Each + line will be prefixed with 'I' or 'X' for included or excluded patterns, + respectively. + + The narrowspec is comprised of expressions to match remote files and/or + directories that should be pulled into your client. + The narrowspec has *include* and *exclude* expressions, with excludes always + trumping includes: that is, if a file matches an exclude expression, it will + be excluded even if it also matches an include expression. + Excluding files that were never included has no effect. + + Each included or excluded entry is in the format described by + 'hg help patterns'. + + The options allow you to add or remove included and excluded expressions. + + If --clear is specified, then all previous includes and excludes are DROPPED + and replaced by the new ones specified to --addinclude and --addexclude. + If --clear is specified without any further options, the narrowspec will be + empty and will not match any files. + """ + opts = pycompat.byteskwargs(opts) + if changegroup.NARROW_REQUIREMENT not in repo.requirements: + ui.warn(_('The narrow command is only supported on respositories cloned' + ' with --narrow.\n')) + return 1 + + # Before supporting, decide whether it "hg tracked --clear" should mean + # tracking no paths or all paths. 
+ if opts['clear']: + ui.warn(_('The --clear option is not yet supported.\n')) + return 1 + + if narrowspec.needsexpansion(opts['addinclude'] + opts['addexclude']): + raise error.Abort('Expansion not yet supported on widen/narrow') + + addedincludes = narrowspec.parsepatterns(opts['addinclude']) + removedincludes = narrowspec.parsepatterns(opts['removeinclude']) + addedexcludes = narrowspec.parsepatterns(opts['addexclude']) + removedexcludes = narrowspec.parsepatterns(opts['removeexclude']) + widening = addedincludes or removedexcludes + narrowing = removedincludes or addedexcludes + only_show = not widening and not narrowing + + # Only print the current narrowspec. + if only_show: + include, exclude = repo.narrowpats + + ui.pager('tracked') + fm = ui.formatter('narrow', opts) + for i in sorted(include): + fm.startitem() + fm.write('status', '%s ', 'I', label='narrow.included') + fm.write('pat', '%s\n', i, label='narrow.included') + for i in sorted(exclude): + fm.startitem() + fm.write('status', '%s ', 'X', label='narrow.excluded') + fm.write('pat', '%s\n', i, label='narrow.excluded') + fm.end() + return 0 + + with repo.wlock(), repo.lock(): + cmdutil.bailifchanged(repo) + + # Find the revisions we have in common with the remote. These will + # be used for finding local-only changes for narrowing. They will + # also define the set of revisions to update for widening. 
+ remotepath = ui.expandpath(remotepath or 'default') + url, branches = hg.parseurl(remotepath) + ui.status(_('comparing with %s\n') % util.hidepassword(url)) + remote = hg.peer(repo, opts, url) + commoninc = discovery.findcommonincoming(repo, remote) + + oldincludes, oldexcludes = repo.narrowpats + if narrowing: + newincludes = oldincludes - removedincludes + newexcludes = oldexcludes | addedexcludes + _narrow(ui, repo, remote, commoninc, oldincludes, oldexcludes, + newincludes, newexcludes, + opts['force_delete_local_changes']) + # _narrow() updated the narrowspec and _widen() below needs to + # use the updated values as its base (otherwise removed includes + # and addedexcludes will be lost in the resulting narrowspec) + oldincludes = newincludes + oldexcludes = newexcludes + + if widening: + newincludes = oldincludes | addedincludes + newexcludes = oldexcludes - removedexcludes + _widen(ui, repo, remote, commoninc, newincludes, newexcludes) + + return 0 diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowcopies.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowcopies.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,34 @@ +# narrowcopies.py - extensions to mercurial copies module to support narrow +# clones +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. 
+ +from __future__ import absolute_import + +from mercurial import ( + copies, + extensions, +) + +def setup(repo): + def _computeforwardmissing(orig, a, b, match=None): + missing = orig(a, b, match) + narrowmatch = repo.narrowmatch() + if narrowmatch.always(): + return missing + missing = [f for f in missing if narrowmatch(f)] + return missing + + def _checkcopies(orig, srcctx, dstctx, f, base, tca, remotebase, limit, + data): + narrowmatch = repo.narrowmatch() + if not narrowmatch(f): + return + orig(srcctx, dstctx, f, base, tca, remotebase, limit, data) + + extensions.wrapfunction(copies, '_computeforwardmissing', + _computeforwardmissing) + extensions.wrapfunction(copies, '_checkcopies', _checkcopies) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowdirstate.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowdirstate.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,82 @@ +# narrowdirstate.py - extensions to mercurial dirstate to support narrow clones +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from mercurial.i18n import _ +from mercurial import ( + dirstate, + error, + extensions, + match as matchmod, + narrowspec, + util as hgutil, +) + +def setup(repo): + """Add narrow spec dirstate ignore, block changes outside narrow spec.""" + + def walk(orig, self, match, subrepos, unknown, ignored, full=True, + narrowonly=True): + if narrowonly: + # hack to not exclude explicitly-specified paths so that they can + # be warned later on e.g. 
dirstate.add() + em = matchmod.exact(match._root, match._cwd, match.files()) + nm = matchmod.unionmatcher([repo.narrowmatch(), em]) + match = matchmod.intersectmatchers(match, nm) + return orig(self, match, subrepos, unknown, ignored, full) + + extensions.wrapfunction(dirstate.dirstate, 'walk', walk) + + # Prevent adding files that are outside the sparse checkout + editfuncs = ['normal', 'add', 'normallookup', 'copy', 'remove', 'merge'] + for func in editfuncs: + def _wrapper(orig, self, *args): + dirstate = repo.dirstate + narrowmatch = repo.narrowmatch() + for f in args: + if f is not None and not narrowmatch(f) and f not in dirstate: + raise error.Abort(_("cannot track '%s' - it is outside " + + "the narrow clone") % f) + return orig(self, *args) + extensions.wrapfunction(dirstate.dirstate, func, _wrapper) + + def filterrebuild(orig, self, parent, allfiles, changedfiles=None): + if changedfiles is None: + # Rebuilding entire dirstate, let's filter allfiles to match the + # narrowspec. 
+ allfiles = [f for f in allfiles if repo.narrowmatch()(f)] + orig(self, parent, allfiles, changedfiles) + + extensions.wrapfunction(dirstate.dirstate, 'rebuild', filterrebuild) + + def _narrowbackupname(backupname): + assert 'dirstate' in backupname + return backupname.replace('dirstate', narrowspec.FILENAME) + + def restorebackup(orig, self, tr, backupname): + self._opener.rename(_narrowbackupname(backupname), narrowspec.FILENAME, + checkambig=True) + orig(self, tr, backupname) + + extensions.wrapfunction(dirstate.dirstate, 'restorebackup', restorebackup) + + def savebackup(orig, self, tr, backupname): + orig(self, tr, backupname) + + narrowbackupname = _narrowbackupname(backupname) + self._opener.tryunlink(narrowbackupname) + hgutil.copyfile(self._opener.join(narrowspec.FILENAME), + self._opener.join(narrowbackupname), hardlink=True) + + extensions.wrapfunction(dirstate.dirstate, 'savebackup', savebackup) + + def clearbackup(orig, self, tr, backupname): + orig(self, tr, backupname) + self._opener.unlink(_narrowbackupname(backupname)) + + extensions.wrapfunction(dirstate.dirstate, 'clearbackup', clearbackup) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowmerge.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowmerge.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,77 @@ +# narrowmerge.py - extensions to mercurial merge module to support narrow clones +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. 
+ +from __future__ import absolute_import + +from mercurial.i18n import _ +from mercurial import ( + copies, + error, + extensions, + merge, +) + +def setup(): + def _manifestmerge(orig, repo, wctx, p2, pa, branchmerge, *args, **kwargs): + """Filter updates to only lay out files that match the narrow spec.""" + actions, diverge, renamedelete = orig( + repo, wctx, p2, pa, branchmerge, *args, **kwargs) + + narrowmatch = repo.narrowmatch() + if narrowmatch.always(): + return actions, diverge, renamedelete + + nooptypes = set(['k']) # TODO: handle with nonconflicttypes + nonconflicttypes = set('a am c cm f g r e'.split()) + # We mutate the items in the dict during iteration, so iterate + # over a copy. + for f, action in list(actions.items()): + if narrowmatch(f): + pass + elif not branchmerge: + del actions[f] # just updating, ignore changes outside clone + elif action[0] in nooptypes: + del actions[f] # merge does not affect file + elif action[0] in nonconflicttypes: + raise error.Abort(_('merge affects file \'%s\' outside narrow, ' + 'which is not yet supported') % f, + hint=_('merging in the other direction ' + 'may work')) + else: + raise error.Abort(_('conflict in file \'%s\' is outside ' + 'narrow clone') % f) + + return actions, diverge, renamedelete + + extensions.wrapfunction(merge, 'manifestmerge', _manifestmerge) + + def _checkcollision(orig, repo, wmf, actions): + narrowmatch = repo.narrowmatch() + if not narrowmatch.always(): + wmf = wmf.matches(narrowmatch) + if actions: + narrowactions = {} + for m, actionsfortype in actions.iteritems(): + narrowactions[m] = [] + for (f, args, msg) in actionsfortype: + if narrowmatch(f): + narrowactions[m].append((f, args, msg)) + actions = narrowactions + return orig(repo, wmf, actions) + + extensions.wrapfunction(merge, '_checkcollision', _checkcollision) + + def _computenonoverlap(orig, repo, *args, **kwargs): + u1, u2 = orig(repo, *args, **kwargs) + narrowmatch = repo.narrowmatch() + if narrowmatch.always(): + 
return u1, u2 + + u1 = [f for f in u1 if narrowmatch(f)] + u2 = [f for f in u2 if narrowmatch(f)] + return u1, u2 + extensions.wrapfunction(copies, '_computenonoverlap', _computenonoverlap) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowpatch.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowpatch.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,41 @@ +# narrowpatch.py - extensions to mercurial patch module to support narrow clones +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from mercurial import ( + extensions, + patch, +) + +def setup(repo): + def _filepairs(orig, *args): + """Only includes files within the narrow spec in the diff.""" + narrowmatch = repo.narrowmatch() + if not narrowmatch.always(): + for x in orig(*args): + f1, f2, copyop = x + if ((not f1 or narrowmatch(f1)) and + (not f2 or narrowmatch(f2))): + yield x + else: + for x in orig(*args): + yield x + + def trydiff(orig, repo, revs, ctx1, ctx2, modified, added, removed, + copy, getfilectx, *args, **kwargs): + narrowmatch = repo.narrowmatch() + if not narrowmatch.always(): + modified = [f for f in modified if narrowmatch(f)] + added = [f for f in added if narrowmatch(f)] + removed = [f for f in removed if narrowmatch(f)] + copy = {k: v for k, v in copy.iteritems() if narrowmatch(k)} + return orig(repo, revs, ctx1, ctx2, modified, added, removed, copy, + getfilectx, *args, **kwargs) + + extensions.wrapfunction(patch, '_filepairs', _filepairs) + extensions.wrapfunction(patch, 'trydiff', trydiff) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowrepo.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowrepo.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,65 @@ +# narrowrepo.py - repository which supports narrow revlogs, lazy loading +# +# Copyright 2017 Google, Inc. 
+# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from mercurial import ( + changegroup, + hg, + narrowspec, + scmutil, +) + +from . import ( + narrowrevlog, +) + +def wrappostshare(orig, sourcerepo, destrepo, **kwargs): + orig(sourcerepo, destrepo, **kwargs) + if changegroup.NARROW_REQUIREMENT in sourcerepo.requirements: + with destrepo.wlock(): + with destrepo.vfs('shared', 'a') as fp: + fp.write(narrowspec.FILENAME + '\n') + +def unsharenarrowspec(orig, ui, repo, repopath): + if (changegroup.NARROW_REQUIREMENT in repo.requirements + and repo.path == repopath and repo.shared()): + srcrepo = hg.sharedreposource(repo) + with srcrepo.vfs(narrowspec.FILENAME) as f: + spec = f.read() + with repo.vfs(narrowspec.FILENAME, 'w') as f: + f.write(spec) + return orig(ui, repo, repopath) + +def wraprepo(repo): + """Enables narrow clone functionality on a single local repository.""" + + class narrowrepository(repo.__class__): + + def file(self, f): + fl = super(narrowrepository, self).file(f) + narrowrevlog.makenarrowfilelog(fl, self.narrowmatch()) + return fl + + # I'm not sure this is the right place to do this filter. + # context._manifestmatches() would probably be better, or perhaps + # move it to a later place, in case some of the callers do want to know + # which directories changed. This seems to work for now, though. 
+ def status(self, *args, **kwargs): + s = super(narrowrepository, self).status(*args, **kwargs) + narrowmatch = self.narrowmatch() + modified = list(filter(narrowmatch, s.modified)) + added = list(filter(narrowmatch, s.added)) + removed = list(filter(narrowmatch, s.removed)) + deleted = list(filter(narrowmatch, s.deleted)) + unknown = list(filter(narrowmatch, s.unknown)) + ignored = list(filter(narrowmatch, s.ignored)) + clean = list(filter(narrowmatch, s.clean)) + return scmutil.status(modified, added, removed, deleted, unknown, + ignored, clean) + + repo.__class__ = narrowrepository diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowrevlog.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowrevlog.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,80 @@ +# narrowrevlog.py - revlog storing irrelevant nodes as "ellipsis" nodes +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from mercurial import ( + revlog, + util, +) + +def readtransform(self, text): + return text, False + +def writetransform(self, text): + return text, False + +def rawtransform(self, text): + return False + +revlog.addflagprocessor(revlog.REVIDX_ELLIPSIS, + (readtransform, writetransform, rawtransform)) + +def setup(): + # We just wanted to add the flag processor, which is done at module + # load time. + pass + +def makenarrowfilelog(fl, narrowmatch): + class narrowfilelog(fl.__class__): + def renamed(self, node): + # Renames that come from outside the narrowspec are + # problematic at least for git-diffs, because we lack the + # base text for the rename. This logic was introduced in + # 3cd72b1 of narrowhg (authored by martinvonz, reviewed by + # adgar), but that revision doesn't have any additional + # commentary on what problems we can encounter. 
+ m = super(narrowfilelog, self).renamed(node) + if m and not narrowmatch(m[0]): + return None + return m + + def size(self, rev): + # We take advantage of the fact that remotefilelog + # lacks a node() method to just skip the + # rename-checking logic when on remotefilelog. This + # might be incorrect on other non-revlog-based storage + # engines, but for now this seems to be fine. + # + # TODO: when remotefilelog is in core, improve this to + # explicitly look for remotefilelog instead of cheating + # with a hasattr check. + if util.safehasattr(self, 'node'): + node = self.node(rev) + # Because renamed() is overridden above to + # sometimes return None even if there is metadata + # in the revlog, size can be incorrect for + # copies/renames, so we need to make sure we call + # the super class's implementation of renamed() + # for the purpose of size calculation. + if super(narrowfilelog, self).renamed(node): + return len(self.read(node)) + return super(narrowfilelog, self).size(rev) + + def cmp(self, node, text): + different = super(narrowfilelog, self).cmp(node, text) + if different: + # Similar to size() above, if the file was copied from + # a file outside the narrowspec, the super class's + # would have returned True because we tricked it into + # thinking that the file was not renamed. + if super(narrowfilelog, self).renamed(node): + t2 = self.read(node) + return t2 != text + return different + + fl.__class__ = narrowfilelog diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowtemplates.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowtemplates.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,48 @@ +# narrowtemplates.py - added template keywords for narrow clones +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. 
+ +from __future__ import absolute_import + +from mercurial import ( + registrar, + revlog, +) + +keywords = {} +templatekeyword = registrar.templatekeyword(keywords) +revsetpredicate = registrar.revsetpredicate() + +def _isellipsis(repo, rev): + if repo.changelog.flags(rev) & revlog.REVIDX_ELLIPSIS: + return True + return False + +@templatekeyword('ellipsis', requires={'repo', 'ctx'}) +def ellipsis(context, mapping): + """String. 'ellipsis' if the change is an ellipsis node, else ''.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + if _isellipsis(repo, ctx.rev()): + return 'ellipsis' + return '' + +@templatekeyword('outsidenarrow', requires={'repo', 'ctx'}) +def outsidenarrow(context, mapping): + """String. 'outsidenarrow' if the change affects no tracked files, + else ''.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + m = repo.narrowmatch() + if not m.always(): + if not any(m(f) for f in ctx.files()): + return 'outsidenarrow' + return '' + +@revsetpredicate('ellipsis') +def ellipsisrevset(repo, subset, x): + """Changesets that are ellipsis nodes.""" + return subset.filter(lambda r: _isellipsis(repo, r)) diff -r fb92df8b634c -r ed5448edcbfa hgext/narrow/narrowwirepeer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/narrow/narrowwirepeer.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,52 @@ +# narrowwirepeer.py - passes narrow spec with unbundle command +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from mercurial.i18n import _ +from mercurial import ( + error, + extensions, + hg, + narrowspec, + node, +) + +def uisetup(): + def peersetup(ui, peer): + # We must set up the expansion before reposetup below, since it's used + # at clone time before we have a repo. 
+ class expandingpeer(peer.__class__): + def expandnarrow(self, narrow_include, narrow_exclude, nodes): + ui.status(_("expanding narrowspec\n")) + if not self.capable('exp-expandnarrow'): + raise error.Abort( + 'peer does not support expanding narrowspecs') + + hex_nodes = (node.hex(n) for n in nodes) + new_narrowspec = self._call( + 'expandnarrow', + includepats=','.join(narrow_include), + excludepats=','.join(narrow_exclude), + nodes=','.join(hex_nodes)) + + return narrowspec.parseserverpatterns(new_narrowspec) + peer.__class__ = expandingpeer + hg.wirepeersetupfuncs.append(peersetup) + +def reposetup(repo): + def wirereposetup(ui, peer): + def wrapped(orig, cmd, *args, **kwargs): + if cmd == 'unbundle': + # TODO: don't blindly add include/exclude wireproto + # arguments to unbundle. + include, exclude = repo.narrowpats + kwargs[r"includepats"] = ','.join(include) + kwargs[r"excludepats"] = ','.join(exclude) + return orig(cmd, *args, **kwargs) + extensions.wrapfunction(peer, '_calltwowaystream', wrapped) + hg.wirepeersetupfuncs.append(wirereposetup) diff -r fb92df8b634c -r ed5448edcbfa hgext/notify.py --- a/hgext/notify.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/notify.py Wed Apr 18 15:32:08 2018 -0400 @@ -103,6 +103,10 @@ Maximum number of diff lines to include in notification email. Set to 0 to disable the diff, or -1 to include all of it. Default: 300. +notify.maxdiffstat + Maximum number of diffstat lines to include in notification email. Set to -1 + to include all of it. Default: -1. + notify.maxsubject Maximum number of characters in email's subject line. Default: 67. @@ -142,13 +146,17 @@ from mercurial.i18n import _ from mercurial import ( - cmdutil, error, + logcmdutil, mail, patch, registrar, util, ) +from mercurial.utils import ( + dateutil, + stringutil, +) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for # extensions which SHIP WITH MERCURIAL. 
Non-mainline extensions should @@ -180,6 +188,9 @@ configitem('notify', 'maxdiff', default=300, ) +configitem('notify', 'maxdiffstat', + default=-1, +) configitem('notify', 'maxsubject', default=67, ) @@ -257,9 +268,8 @@ mapfile = self.ui.config('notify', 'style') if not mapfile and not template: template = deftemplates.get(hooktype) or single_template - spec = cmdutil.logtemplatespec(template, mapfile) - self.t = cmdutil.changeset_templater(self.ui, self.repo, spec, - False, None, False) + spec = logcmdutil.templatespec(template, mapfile) + self.t = logcmdutil.changesettemplater(self.ui, self.repo, spec) def strip(self, path): '''strip leading slashes from local path, turn into web-safe path.''' @@ -277,7 +287,7 @@ def fixmail(self, addr): '''try to clean up email addresses.''' - addr = util.email(addr.strip()) + addr = stringutil.email(addr.strip()) if self.domain: a = addr.find('@localhost') if a != -1: @@ -361,7 +371,7 @@ for k, v in headers: msg[k] = v - msg['Date'] = util.datestr(format="%a, %d %b %Y %H:%M:%S %1%2") + msg['Date'] = dateutil.datestr(format="%a, %d %b %Y %H:%M:%S %1%2") # try to make subject line exist and be useful if not subject: @@ -372,7 +382,7 @@ subject = '%s: %s' % (self.root, s) maxsubject = int(self.ui.config('notify', 'maxsubject')) if maxsubject: - subject = util.ellipsis(subject, maxsubject) + subject = stringutil.ellipsis(subject, maxsubject) msg['Subject'] = mail.headencode(self.ui, subject, self.charsets, self.test) @@ -399,7 +409,7 @@ else: self.ui.status(_('notify: sending %d subscribers %d changes\n') % (len(subs), count)) - mail.sendmail(self.ui, util.email(msg['From']), + mail.sendmail(self.ui, stringutil.email(msg['From']), subs, msgtext, mbox=self.mbox) def diff(self, ctx, ref=None): @@ -415,10 +425,17 @@ difflines = ''.join(chunks).splitlines() if self.ui.configbool('notify', 'diffstat'): + maxdiffstat = int(self.ui.config('notify', 'maxdiffstat')) s = patch.diffstat(difflines) # s may be nil, don't include the header if 
it is if s: - self.ui.write(_('\ndiffstat:\n\n%s') % s) + if maxdiffstat >= 0 and s.count("\n") > maxdiffstat + 1: + s = s.split("\n") + msg = _('\ndiffstat (truncated from %d to %d lines):\n\n') + self.ui.write(msg % (len(s) - 2, maxdiffstat)) + self.ui.write("\n".join(s[:maxdiffstat] + s[-2:])) + else: + self.ui.write(_('\ndiffstat:\n\n%s') % s) if maxdiff == 0: return diff -r fb92df8b634c -r ed5448edcbfa hgext/patchbomb.py --- a/hgext/patchbomb.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/patchbomb.py Wed Apr 18 15:32:08 2018 -0400 @@ -74,6 +74,8 @@ from __future__ import absolute_import import email as emailmod +import email.generator as emailgen +import email.utils as eutil import errno import os import socket @@ -83,6 +85,7 @@ from mercurial import ( cmdutil, commands, + encoding, error, formatter, hg, @@ -96,6 +99,7 @@ templater, util, ) +from mercurial.utils import dateutil stringio = util.stringio cmdtable = {} @@ -208,7 +212,7 @@ if not numbered: return '[PATCH%s]' % flag else: - tlen = len(str(total)) + tlen = len("%d" % total) return '[PATCH %0*d of %d%s]' % (tlen, idx, total, flag) def makepatch(ui, repo, rev, patchlines, opts, _charsets, idx, total, numbered, @@ -265,11 +269,10 @@ if patchtags: patchname = patchtags[0] elif total > 1: - patchname = cmdutil.makefilename(repo, '%b-%n.patch', - binnode, seqno=idx, - total=total) + patchname = cmdutil.makefilename(repo[node], '%b-%n.patch', + seqno=idx, total=total) else: - patchname = cmdutil.makefilename(repo, '%b.patch', binnode) + patchname = cmdutil.makefilename(repo[node], '%b.patch') disposition = 'inline' if opts.get('attach'): disposition = 'attachment' @@ -303,8 +306,8 @@ ui.warn(_('warning: working directory has ' 'uncommitted changes\n')) output = stringio() - cmdutil.export(repo, [r], fp=output, - opts=patch.difffeatureopts(ui, opts, git=True)) + cmdutil.exportfile(repo, [r], output, + opts=patch.difffeatureopts(ui, opts, git=True)) yield output.getvalue().split('\n') def _getbundle(repo, 
dest, **opts): """return a bundle containing changesets missing in "dest" @@ -627,7 +630,7 @@ if outgoing: revs = _getoutgoing(repo, dest, revs) if bundle: - opts['revs'] = [str(r) for r in revs] + opts['revs'] = ["%d" % r for r in revs] # check if revision exist on the public destination publicurl = repo.ui.config('patchbomb', 'publicurl') @@ -655,19 +658,21 @@ else: msg = _('public url %s is missing %s') msg %= (publicurl, missing[0]) + missingrevs = [ctx.rev() for ctx in missing] revhint = ' '.join('-r %s' % h - for h in repo.set('heads(%ld)', missing)) + for h in repo.set('heads(%ld)', missingrevs)) hint = _("use 'hg push %s %s'") % (publicurl, revhint) raise error.Abort(msg, hint=hint) # start if date: - start_time = util.parsedate(date) + start_time = dateutil.parsedate(date) else: - start_time = util.makedate() + start_time = dateutil.makedate() def genmsgid(id): - return '<%s.%s@%s>' % (id[:20], int(start_time[0]), socket.getfqdn()) + return '<%s.%d@%s>' % (id[:20], int(start_time[0]), + encoding.strtolocal(socket.getfqdn())) # deprecated config: patchbomb.from sender = (opts.get('from') or ui.config('email', 'from') or @@ -744,7 +749,7 @@ if not parent.endswith('>'): parent += '>' - sender_addr = emailmod.Utils.parseaddr(sender)[1] + sender_addr = eutil.parseaddr(encoding.strfromlocal(sender))[1] sender = mail.addressencode(ui, sender, _charsets, opts.get('test')) sendmail = None firstpatch = None @@ -763,7 +768,7 @@ parent = m['Message-Id'] m['User-Agent'] = 'Mercurial-patchbomb/%s' % util.version() - m['Date'] = emailmod.Utils.formatdate(start_time[0], localtime=True) + m['Date'] = eutil.formatdate(start_time[0], localtime=True) start_time = (start_time[0] + 1, start_time[1]) m['From'] = sender @@ -777,7 +782,7 @@ if opts.get('test'): ui.status(_('displaying '), subj, ' ...\n') ui.pager('email') - generator = emailmod.Generator.Generator(ui, mangle_from_=False) + generator = emailgen.Generator(ui, mangle_from_=False) try: generator.flatten(m, 0) 
ui.write('\n') @@ -794,7 +799,7 @@ # Exim does not remove the Bcc field del m['Bcc'] fp = stringio() - generator = emailmod.Generator.Generator(fp, mangle_from_=False) + generator = emailgen.Generator(fp, mangle_from_=False) generator.flatten(m, 0) sendmail(sender_addr, to + bcc + cc, fp.getvalue()) diff -r fb92df8b634c -r ed5448edcbfa hgext/purge.py --- a/hgext/purge.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/purge.py Wed Apr 18 15:32:08 2018 -0400 @@ -31,6 +31,7 @@ from mercurial import ( cmdutil, error, + pycompat, registrar, scmutil, util, @@ -84,6 +85,7 @@ list of files that this program would delete, use the --print option. ''' + opts = pycompat.byteskwargs(opts) act = not opts.get('print') eol = '\n' if opts.get('print0'): diff -r fb92df8b634c -r ed5448edcbfa hgext/rebase.py --- a/hgext/rebase.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/rebase.py Wed Apr 18 15:32:08 2018 -0400 @@ -21,7 +21,6 @@ from mercurial.i18n import _ from mercurial.node import ( - nullid, nullrev, short, ) @@ -87,17 +86,6 @@ def _savebranch(ctx, extra): extra['branch'] = ctx.branch() -def _makeextrafn(copiers): - """make an extrafn out of the given copy-functions. - - A copy function takes a context and an extra dict, and mutates the - extra dict as needed based on the given context. 
- """ - def extrafn(ctx, extra): - for c in copiers: - c(ctx, extra) - return extrafn - def _destrebase(repo, sourceset, destspace=None): """small wrapper around destmerge to pass the right extra args @@ -120,6 +108,25 @@ sourceset = revset.getset(repo, smartset.fullreposet(repo), x) return subset & smartset.baseset([_destrebase(repo, sourceset)]) +@revsetpredicate('_destautoorphanrebase') +def _revsetdestautoorphanrebase(repo, subset, x): + """automatic rebase destination for a single orphan revision""" + unfi = repo.unfiltered() + obsoleted = unfi.revs('obsolete()') + + src = revset.getset(repo, subset, x).first() + + # Empty src or already obsoleted - Do not return a destination + if not src or src in obsoleted: + return smartset.baseset() + dests = destutil.orphanpossibledestination(repo, src) + if len(dests) > 1: + raise error.Abort( + _("ambiguous automatic rebase: %r could end up on any of %r") % ( + src, dests)) + # We have zero or one destination, so we can just return here. + return smartset.baseset(dests) + def _ctxdesc(ctx): """short description for a context""" desc = '%d:%s "%s"' % (ctx.rev(), ctx, @@ -174,9 +181,6 @@ self.keepf = opts.get('keep', False) self.keepbranchesf = opts.get('keepbranches', False) - # keepopen is not meant for use on the command line, but by - # other extensions - self.keepopen = opts.get('keepopen', False) self.obsoletenotrebased = {} self.obsoletewithoutsuccessorindestination = set() self.inmemory = inmemory @@ -214,7 +218,7 @@ if v >= 0: newrev = repo[v].hex() else: - newrev = v + newrev = "%d" % v destnode = repo[destmap[d]].hex() f.write("%s:%s:%s\n" % (oldrev, newrev, destnode)) repo.ui.debug('rebase status stored\n') @@ -257,20 +261,20 @@ activebookmark = l else: args = l.split(':') - oldrev = args[0] + oldrev = repo[args[0]].rev() newrev = args[1] if newrev in legacystates: continue if len(args) > 2: - destnode = args[2] + destrev = repo[args[2]].rev() else: - destnode = legacydest - destmap[repo[oldrev].rev()] = 
repo[destnode].rev() - if newrev in (nullid, revtodostr): - state[repo[oldrev].rev()] = revtodo + destrev = legacydest + destmap[oldrev] = destrev + if newrev == revtodostr: + state[oldrev] = revtodo # Legacy compat special case else: - state[repo[oldrev].rev()] = repo[newrev].rev() + state[oldrev] = repo[newrev].rev() except IOError as err: if err.errno != errno.ENOENT: @@ -289,7 +293,7 @@ skipped.add(old) seen.add(new) repo.ui.debug('computed skipped revs: %s\n' % - (' '.join(str(r) for r in sorted(skipped)) or None)) + (' '.join('%d' % r for r in sorted(skipped)) or '')) repo.ui.debug('rebase status resumed\n') self.originalwd = originalwd @@ -312,10 +316,13 @@ if not self.ui.configbool('experimental', 'rebaseskipobsolete'): return obsoleteset = set(obsoleterevs) - self.obsoletenotrebased, self.obsoletewithoutsuccessorindestination = \ - _computeobsoletenotrebased(self.repo, obsoleteset, destmap) + (self.obsoletenotrebased, + self.obsoletewithoutsuccessorindestination, + obsoleteextinctsuccessors) = _computeobsoletenotrebased( + self.repo, obsoleteset, destmap) skippedset = set(self.obsoletenotrebased) skippedset.update(self.obsoletewithoutsuccessorindestination) + skippedset.update(obsoleteextinctsuccessors) _checkobsrebase(self.repo, self.ui, obsoleteset, skippedset) def _prepareabortorcontinue(self, isabort): @@ -390,7 +397,7 @@ else: self.wctx = self.repo[None] self.repo.ui.debug("rebasing on disk\n") - self.repo.ui.log("rebase", "", rebase_imm_used=self.wctx.isinmemory()) + self.repo.ui.log("rebase", "", rebase_imm_used=self.inmemory) def _performrebase(self, tr): self._assignworkingcopy() @@ -420,134 +427,168 @@ # Store the state before we begin so users can run 'hg rebase --abort' # if we fail before the transaction closes. self.storestatus() + if tr: + # When using single transaction, store state when transaction + # commits. 
+ self.storestatus(tr) cands = [k for k, v in self.state.iteritems() if v == revtodo] total = len(cands) - pos = 0 + posholder = [0] + def progress(ctx): + posholder[0] += 1 + self.repo.ui.progress(_("rebasing"), posholder[0], + ("%d:%s" % (ctx.rev(), ctx)), + _('changesets'), total) + allowdivergence = self.ui.configbool( + 'experimental', 'evolution.allowdivergence') for subset in sortsource(self.destmap): - pos = self._performrebasesubset(tr, subset, pos, total) + sortedrevs = self.repo.revs('sort(%ld, -topo)', subset) + if not allowdivergence: + sortedrevs -= self.repo.revs( + 'descendants(%ld) and not %ld', + self.obsoletewithoutsuccessorindestination, + self.obsoletewithoutsuccessorindestination, + ) + for rev in sortedrevs: + self._rebasenode(tr, rev, allowdivergence, progress) ui.progress(_('rebasing'), None) ui.note(_('rebase merging completed\n')) - def _performrebasesubset(self, tr, subset, pos, total): + def _concludenode(self, rev, p1, p2, editor, commitmsg=None): + '''Commit the wd changes with parents p1 and p2. + + Reuse commit info from rev but also store useful information in extra. 
+ Return node of committed revision.''' + repo = self.repo + ctx = repo[rev] + if commitmsg is None: + commitmsg = ctx.description() + date = self.date + if date is None: + date = ctx.date() + extra = {'rebase_source': ctx.hex()} + for c in self.extrafns: + c(ctx, extra) + keepbranch = self.keepbranchesf and repo[p1].branch() != ctx.branch() + destphase = max(ctx.phase(), phases.draft) + overrides = {('phases', 'new-commit'): destphase} + if keepbranch: + overrides[('ui', 'allowemptycommit')] = True + with repo.ui.configoverride(overrides, 'rebase'): + if self.inmemory: + newnode = commitmemorynode(repo, p1, p2, + wctx=self.wctx, + extra=extra, + commitmsg=commitmsg, + editor=editor, + user=ctx.user(), + date=date) + mergemod.mergestate.clean(repo) + else: + newnode = commitnode(repo, p1, p2, + extra=extra, + commitmsg=commitmsg, + editor=editor, + user=ctx.user(), + date=date) + + if newnode is None: + # If it ended up being a no-op commit, then the normal + # merge state clean-up path doesn't happen, so do it + # here. 
Fix issue5494 + mergemod.mergestate.clean(repo) + return newnode + + def _rebasenode(self, tr, rev, allowdivergence, progressfn): repo, ui, opts = self.repo, self.ui, self.opts - sortedrevs = repo.revs('sort(%ld, -topo)', subset) - allowdivergence = self.ui.configbool( - 'experimental', 'evolution.allowdivergence') - if not allowdivergence: - sortedrevs -= repo.revs( - 'descendants(%ld) and not %ld', - self.obsoletewithoutsuccessorindestination, - self.obsoletewithoutsuccessorindestination, - ) - for rev in sortedrevs: - dest = self.destmap[rev] - ctx = repo[rev] - desc = _ctxdesc(ctx) - if self.state[rev] == rev: - ui.status(_('already rebased %s\n') % desc) - elif (not allowdivergence - and rev in self.obsoletewithoutsuccessorindestination): - msg = _('note: not rebasing %s and its descendants as ' - 'this would cause divergence\n') % desc - repo.ui.status(msg) - self.skipped.add(rev) - elif rev in self.obsoletenotrebased: - succ = self.obsoletenotrebased[rev] - if succ is None: - msg = _('note: not rebasing %s, it has no ' - 'successor\n') % desc - else: - succdesc = _ctxdesc(repo[succ]) - msg = (_('note: not rebasing %s, already in ' - 'destination as %s\n') % (desc, succdesc)) - repo.ui.status(msg) - # Make clearrebased aware state[rev] is not a true successor - self.skipped.add(rev) - # Record rev as moved to its desired destination in self.state. - # This helps bookmark and working parent movement. 
- dest = max(adjustdest(repo, rev, self.destmap, self.state, - self.skipped)) - self.state[rev] = dest - elif self.state[rev] == revtodo: - pos += 1 - ui.status(_('rebasing %s\n') % desc) - ui.progress(_("rebasing"), pos, ("%d:%s" % (rev, ctx)), - _('changesets'), total) - p1, p2, base = defineparents(repo, rev, self.destmap, - self.state, self.skipped, - self.obsoletenotrebased) - self.storestatus(tr=tr) - storecollapsemsg(repo, self.collapsemsg) - if len(repo[None].parents()) == 2: - repo.ui.debug('resuming interrupted rebase\n') + dest = self.destmap[rev] + ctx = repo[rev] + desc = _ctxdesc(ctx) + if self.state[rev] == rev: + ui.status(_('already rebased %s\n') % desc) + elif (not allowdivergence + and rev in self.obsoletewithoutsuccessorindestination): + msg = _('note: not rebasing %s and its descendants as ' + 'this would cause divergence\n') % desc + repo.ui.status(msg) + self.skipped.add(rev) + elif rev in self.obsoletenotrebased: + succ = self.obsoletenotrebased[rev] + if succ is None: + msg = _('note: not rebasing %s, it has no ' + 'successor\n') % desc + else: + succdesc = _ctxdesc(repo[succ]) + msg = (_('note: not rebasing %s, already in ' + 'destination as %s\n') % (desc, succdesc)) + repo.ui.status(msg) + # Make clearrebased aware state[rev] is not a true successor + self.skipped.add(rev) + # Record rev as moved to its desired destination in self.state. + # This helps bookmark and working parent movement. 
+ dest = max(adjustdest(repo, rev, self.destmap, self.state, + self.skipped)) + self.state[rev] = dest + elif self.state[rev] == revtodo: + ui.status(_('rebasing %s\n') % desc) + progressfn(ctx) + p1, p2, base = defineparents(repo, rev, self.destmap, + self.state, self.skipped, + self.obsoletenotrebased) + if len(repo[None].parents()) == 2: + repo.ui.debug('resuming interrupted rebase\n') + else: + overrides = {('ui', 'forcemerge'): opts.get('tool', '')} + with ui.configoverride(overrides, 'rebase'): + stats = rebasenode(repo, rev, p1, base, self.collapsef, + dest, wctx=self.wctx) + if stats.unresolvedcount > 0: + if self.inmemory: + raise error.InMemoryMergeConflictsError() + else: + raise error.InterventionRequired( + _('unresolved conflicts (see hg ' + 'resolve, then hg rebase --continue)')) + if not self.collapsef: + merging = p2 != nullrev + editform = cmdutil.mergeeditform(merging, 'rebase') + editor = cmdutil.getcommiteditor(editform=editform, + **pycompat.strkwargs(opts)) + newnode = self._concludenode(rev, p1, p2, editor) + else: + # Skip commit if we are collapsing + if self.inmemory: + self.wctx.setbase(repo[p1]) else: - try: - ui.setconfig('ui', 'forcemerge', opts.get('tool', ''), - 'rebase') - stats = rebasenode(repo, rev, p1, base, self.state, - self.collapsef, dest, wctx=self.wctx) - if stats and stats[3] > 0: - if self.wctx.isinmemory(): - raise error.InMemoryMergeConflictsError() - else: - raise error.InterventionRequired( - _('unresolved conflicts (see hg ' - 'resolve, then hg rebase --continue)')) - finally: - ui.setconfig('ui', 'forcemerge', '', 'rebase') + repo.setparents(repo[p1].node()) + newnode = None + # Update the state + if newnode is not None: + self.state[rev] = repo[newnode].rev() + ui.debug('rebased as %s\n' % short(newnode)) + else: if not self.collapsef: - merging = p2 != nullrev - editform = cmdutil.mergeeditform(merging, 'rebase') - editor = cmdutil.getcommiteditor(editform=editform, **opts) - if self.wctx.isinmemory(): - newnode 
= concludememorynode(repo, rev, p1, p2, - wctx=self.wctx, - extrafn=_makeextrafn(self.extrafns), - editor=editor, - keepbranches=self.keepbranchesf, - date=self.date) - mergemod.mergestate.clean(repo) - else: - newnode = concludenode(repo, rev, p1, p2, - extrafn=_makeextrafn(self.extrafns), - editor=editor, - keepbranches=self.keepbranchesf, - date=self.date) - - if newnode is None: - # If it ended up being a no-op commit, then the normal - # merge state clean-up path doesn't happen, so do it - # here. Fix issue5494 - mergemod.mergestate.clean(repo) - else: - # Skip commit if we are collapsing - if self.wctx.isinmemory(): - self.wctx.setbase(repo[p1]) - else: - repo.setparents(repo[p1].node()) - newnode = None - # Update the state - if newnode is not None: - self.state[rev] = repo[newnode].rev() - ui.debug('rebased as %s\n' % short(newnode)) - else: - if not self.collapsef: - ui.warn(_('note: rebase of %d:%s created no changes ' - 'to commit\n') % (rev, ctx)) - self.skipped.add(rev) - self.state[rev] = p1 - ui.debug('next revision set to %s\n' % p1) - else: - ui.status(_('already rebased %s as %s\n') % - (desc, repo[self.state[rev]])) - return pos + ui.warn(_('note: rebase of %d:%s created no changes ' + 'to commit\n') % (rev, ctx)) + self.skipped.add(rev) + self.state[rev] = p1 + ui.debug('next revision set to %d\n' % p1) + else: + ui.status(_('already rebased %s as %s\n') % + (desc, repo[self.state[rev]])) + if not tr: + # When not using single transaction, store state after each + # commit is completely done. On InterventionRequired, we thus + # won't store the status. Instead, we'll hit the "len(parents) == 2" + # case and realize that the commit was in progress. 
+ self.storestatus() def _finishrebase(self): repo, ui, opts = self.repo, self.ui, self.opts fm = ui.formatter('rebase', opts) fm.startitem() - if self.collapsef and not self.keepopen: + if self.collapsef: p1, p2, _base = defineparents(repo, min(self.state), self.destmap, self.state, self.skipped, self.obsoletenotrebased) @@ -564,32 +605,17 @@ editor = cmdutil.getcommiteditor(edit=editopt, editform=editform) revtoreuse = max(self.state) - dsguard = None - if self.inmemory: - newnode = concludememorynode(repo, revtoreuse, p1, - self.external, - commitmsg=commitmsg, - extrafn=_makeextrafn(self.extrafns), - editor=editor, - keepbranches=self.keepbranchesf, - date=self.date, wctx=self.wctx) - else: - if ui.configbool('rebase', 'singletransaction'): - dsguard = dirstateguard.dirstateguard(repo, 'rebase') - with util.acceptintervention(dsguard): - newnode = concludenode(repo, revtoreuse, p1, self.external, - commitmsg=commitmsg, - extrafn=_makeextrafn(self.extrafns), - editor=editor, - keepbranches=self.keepbranchesf, - date=self.date) + newnode = self._concludenode(revtoreuse, p1, self.external, + editor, commitmsg=commitmsg) + if newnode is not None: newrev = repo[newnode].rev() - for oldrev in self.state.iterkeys(): + for oldrev in self.state: self.state[oldrev] = newrev if 'qtip' in repo.tags(): - updatemq(repo, self.state, self.skipped, **opts) + updatemq(repo, self.state, self.skipped, + **pycompat.strkwargs(opts)) # restore original working directory # (we do this before stripping) @@ -597,15 +623,13 @@ if newwd < 0: # original directory is a parent of rebase set root or ignored newwd = self.originalwd - if (newwd not in [c.rev() for c in repo[None].parents()] and - not self.inmemory): + if newwd not in [c.rev() for c in repo[None].parents()]: ui.note(_("update back to initial working directory parent\n")) hg.updaterepo(repo, newwd, False) collapsedas = None - if not self.keepf: - if self.collapsef: - collapsedas = newnode + if self.collapsef and not self.keepf: + 
collapsedas = newnode clearrebased(ui, repo, self.destmap, self.state, self.skipped, collapsedas, self.keepf, fm=fm) @@ -646,7 +670,10 @@ ('i', 'interactive', False, _('(DEPRECATED)')), ('t', 'tool', '', _('specify merge tool')), ('c', 'continue', False, _('continue an interrupted rebase')), - ('a', 'abort', False, _('abort an interrupted rebase'))] + + ('a', 'abort', False, _('abort an interrupted rebase')), + ('', 'auto-orphans', '', _('automatically rebase orphan revisions ' + 'in the specified revset (EXPERIMENTAL)')), + ] + cmdutil.formatteropts, _('[-s REV | -b REV] [-d REV] [OPTION]')) def rebase(ui, repo, **opts): @@ -778,11 +805,22 @@ # fail the entire transaction.) inmemory = False + if opts.get('auto_orphans'): + for key in opts: + if key != 'auto_orphans' and opts.get(key): + raise error.Abort(_('--auto-orphans is incompatible with %s') % + ('--' + key)) + userrevs = list(repo.revs(opts.get('auto_orphans'))) + opts['rev'] = [revsetlang.formatspec('%ld and orphan()', userrevs)] + opts['dest'] = '_destautoorphanrebase(SRC)' + if inmemory: try: # in-memory merge doesn't support conflicts, so if we hit any, abort # and re-run as an on-disk merge. 
- return _origrebase(ui, repo, inmemory=inmemory, **opts) + overrides = {('rebase', 'singletransaction'): True} + with ui.configoverride(overrides, 'rebase'): + return _origrebase(ui, repo, inmemory=inmemory, **opts) except error.InMemoryMergeConflictsError: ui.warn(_('hit merge conflicts; re-running rebase without in-memory' ' merge\n')) @@ -840,14 +878,14 @@ if retcode is not None: return retcode else: - destmap = _definedestmap(ui, repo, rbsrt, destf, srcf, basef, revf, - destspace=destspace) + destmap = _definedestmap(ui, repo, inmemory, destf, srcf, basef, + revf, destspace=destspace) retcode = rbsrt._preparenewrebase(destmap) if retcode is not None: return retcode + storecollapsemsg(repo, rbsrt.collapsemsg) tr = None - dsguard = None singletr = ui.configbool('rebase', 'singletransaction') if singletr: @@ -859,14 +897,14 @@ with util.acceptintervention(tr): # Same logic for the dirstate guard, except we don't create one when # rebasing in-memory (it's not needed). + dsguard = None if singletr and not inmemory: dsguard = dirstateguard.dirstateguard(repo, 'rebase') with util.acceptintervention(dsguard): rbsrt._performrebase(tr) + rbsrt._finishrebase() - rbsrt._finishrebase() - -def _definedestmap(ui, repo, rbsrt, destf=None, srcf=None, basef=None, +def _definedestmap(ui, repo, inmemory, destf=None, srcf=None, basef=None, revf=None, destspace=None): """use revisions argument to define destmap {srcrev: destrev}""" if revf is None: @@ -881,7 +919,7 @@ if revf and srcf: raise error.Abort(_('cannot specify both a revision and a source')) - if not rbsrt.inmemory: + if not inmemory: cmdutil.checkunfinished(repo) cmdutil.bailifchanged(repo) @@ -914,12 +952,12 @@ dest = scmutil.revsingle(repo, destf) else: dest = repo[_destrebase(repo, base, destspace=destspace)] - destf = str(dest) + destf = bytes(dest) roots = [] # selected children of branching points bpbase = {} # {branchingpoint: [origbase]} for b in base: # group bases by branching points - bp = 
repo.revs('ancestor(%d, %d)', b, dest).first() + bp = repo.revs('ancestor(%d, %d)', b, dest.rev()).first() bpbase[bp] = bpbase.get(bp, []) + [b] if None in bpbase: # emulate the old behavior, showing "nothing to rebase" (a better @@ -941,12 +979,12 @@ else: ui.status(_('nothing to rebase - working directory ' 'parent is also destination\n')) - elif not repo.revs('%ld - ::%d', base, dest): + elif not repo.revs('%ld - ::%d', base, dest.rev()): if basef: ui.status(_('nothing to rebase - "base" %s is ' 'already an ancestor of destination ' '%s\n') % - ('+'.join(str(repo[r]) for r in base), + ('+'.join(bytes(repo[r]) for r in base), dest)) else: ui.status(_('nothing to rebase - working ' @@ -954,29 +992,19 @@ 'ancestor of destination %s\n') % dest) else: # can it happen? ui.status(_('nothing to rebase from %s to %s\n') % - ('+'.join(str(repo[r]) for r in base), dest)) + ('+'.join(bytes(repo[r]) for r in base), dest)) return None - # If rebasing the working copy parent, force in-memory merge to be off. - # - # This is because the extra work of checking out the newly rebased commit - # outweights the benefits of rebasing in-memory, and executing an extra - # update command adds a bit of overhead, so better to just do it on disk. In - # all other cases leave it on. - # - # Note that there are cases where this isn't true -- e.g., rebasing large - # stacks that include the WCP. However, I'm not yet sure where the cutoff - # is. + rebasingwcp = repo['.'].rev() in rebaseset ui.log("rebase", "", rebase_rebasing_wcp=rebasingwcp) - if rbsrt.inmemory and rebasingwcp: - rbsrt.inmemory = False + if inmemory and rebasingwcp: # Check these since we did not before. 
cmdutil.checkunfinished(repo) cmdutil.bailifchanged(repo) if not destf: dest = repo[_destrebase(repo, rebaseset, destspace=destspace)] - destf = str(dest) + destf = bytes(dest) allsrc = revsetlang.formatspec('%ld', rebaseset) alias = {'ALLSRC': allsrc} @@ -1031,82 +1059,47 @@ return nullrev if len(parents) == 1: return parents.pop() - raise error.Abort(_('unable to collapse on top of %s, there is more ' + raise error.Abort(_('unable to collapse on top of %d, there is more ' 'than one external parent: %s') % (max(destancestors), - ', '.join(str(p) for p in sorted(parents)))) + ', '.join("%d" % p for p in sorted(parents)))) -def concludememorynode(repo, rev, p1, p2, wctx=None, - commitmsg=None, editor=None, extrafn=None, - keepbranches=False, date=None): - '''Commit the memory changes with parents p1 and p2. Reuse commit info from - rev but also store useful information in extra. +def commitmemorynode(repo, p1, p2, wctx, editor, extra, user, date, commitmsg): + '''Commit the memory changes with parents p1 and p2. Return node of committed revision.''' - ctx = repo[rev] - if commitmsg is None: - commitmsg = ctx.description() - keepbranch = keepbranches and repo[p1].branch() != ctx.branch() - extra = {'rebase_source': ctx.hex()} - if extrafn: - extrafn(ctx, extra) + # Replicates the empty check in ``repo.commit``. + if wctx.isempty() and not repo.ui.configbool('ui', 'allowemptycommit'): + return None - destphase = max(ctx.phase(), phases.draft) - overrides = {('phases', 'new-commit'): destphase} - with repo.ui.configoverride(overrides, 'rebase'): - if keepbranch: - repo.ui.setconfig('ui', 'allowemptycommit', True) - # Replicates the empty check in ``repo.commit``. - if wctx.isempty() and not repo.ui.configbool('ui', 'allowemptycommit'): - return None - - if date is None: - date = ctx.date() + # By convention, ``extra['branch']`` (set by extrafn) clobbers + # ``branch`` (used when passing ``--keepbranches``). 
+ branch = repo[p1].branch() + if 'branch' in extra: + branch = extra['branch'] - # By convention, ``extra['branch']`` (set by extrafn) clobbers - # ``branch`` (used when passing ``--keepbranches``). - branch = repo[p1].branch() - if 'branch' in extra: - branch = extra['branch'] + memctx = wctx.tomemctx(commitmsg, parents=(p1, p2), date=date, + extra=extra, user=user, branch=branch, editor=editor) + commitres = repo.commitctx(memctx) + wctx.clean() # Might be reused + return commitres - memctx = wctx.tomemctx(commitmsg, parents=(p1, p2), date=date, - extra=extra, user=ctx.user(), branch=branch, editor=editor) - commitres = repo.commitctx(memctx) - wctx.clean() # Might be reused - return commitres - -def concludenode(repo, rev, p1, p2, commitmsg=None, editor=None, extrafn=None, - keepbranches=False, date=None): - '''Commit the wd changes with parents p1 and p2. Reuse commit info from rev - but also store useful information in extra. +def commitnode(repo, p1, p2, editor, extra, user, date, commitmsg): + '''Commit the wd changes with parents p1 and p2. 
Return node of committed revision.''' dsguard = util.nullcontextmanager() if not repo.ui.configbool('rebase', 'singletransaction'): dsguard = dirstateguard.dirstateguard(repo, 'rebase') with dsguard: repo.setparents(repo[p1].node(), repo[p2].node()) - ctx = repo[rev] - if commitmsg is None: - commitmsg = ctx.description() - keepbranch = keepbranches and repo[p1].branch() != ctx.branch() - extra = {'rebase_source': ctx.hex()} - if extrafn: - extrafn(ctx, extra) - destphase = max(ctx.phase(), phases.draft) - overrides = {('phases', 'new-commit'): destphase} - with repo.ui.configoverride(overrides, 'rebase'): - if keepbranch: - repo.ui.setconfig('ui', 'allowemptycommit', True) - # Commit might fail if unresolved files exist - if date is None: - date = ctx.date() - newnode = repo.commit(text=commitmsg, user=ctx.user(), - date=date, extra=extra, editor=editor) + # Commit might fail if unresolved files exist + newnode = repo.commit(text=commitmsg, user=user, date=date, + extra=extra, editor=editor) repo.dirstate.setbranch(repo[newnode].branch()) return newnode -def rebasenode(repo, rev, p1, base, state, collapse, dest, wctx): +def rebasenode(repo, rev, p1, base, collapse, dest, wctx): 'Rebase a single revision rev on top of p1 using base as merge ancestor' # Merge phase # Update to destination and merge it with local @@ -1220,7 +1213,7 @@ `rebaseobsrevs`: set of obsolete revision in source `rebaseobsskipped`: set of revisions from source skipped because they have - successors in destination + successors in destination or no non-obsolete successor. 
""" # Obsolete node with successors not in dest leads to divergence divergenceok = ui.configbool('experimental', @@ -1228,7 +1221,7 @@ divergencebasecandidates = rebaseobsrevs - rebaseobsskipped if divergencebasecandidates and not divergenceok: - divhashes = (str(repo[r]) + divhashes = (bytes(repo[r]) for r in divergencebasecandidates) msg = _("this rebase will cause " "divergences from: %s") @@ -1436,7 +1429,7 @@ def isagitpatch(repo, patchname): 'Return true if the given patch is in git format' mqpatch = os.path.join(repo.mq.path, patchname) - for line in patch.linereader(file(mqpatch, 'rb')): + for line in patch.linereader(open(mqpatch, 'rb')): if line.startswith('diff --git'): return True return False @@ -1465,10 +1458,10 @@ for rev in sorted(mqrebase, reverse=True): if rev not in skipped: name, isgit = mqrebase[rev] - repo.ui.note(_('updating mq patch %s to %s:%s\n') % + repo.ui.note(_('updating mq patch %s to %d:%s\n') % (name, state[rev], repo[state[rev]])) mq.qimport(repo, (), patchname=name, git=isgit, - rev=[str(state[rev])]) + rev=["%d" % state[rev]]) else: # Rebased and skipped skippedpatches.add(mqrebase[rev][0]) @@ -1550,7 +1543,7 @@ cleanup = True if immutable: repo.ui.warn(_("warning: can't clean up public changesets %s\n") - % ', '.join(str(repo[r]) for r in immutable), + % ', '.join(bytes(repo[r]) for r in immutable), hint=_("see 'hg help phases' for details")) cleanup = False @@ -1645,7 +1638,9 @@ roots = list(repo.set('roots(%ld)', sortedsrc[0])) if not roots: raise error.Abort(_('no matching revisions')) - roots.sort() + def revof(r): + return r.rev() + roots = sorted(roots, key=revof) state = dict.fromkeys(rebaseset, revtodo) emptyrebase = (len(sortedsrc) == 1) for root in roots: @@ -1784,25 +1779,34 @@ `obsoletewithoutsuccessorindestination` is a set with obsolete revisions without a successor in destination. + + `obsoleteextinctsuccessors` is a set of obsolete revisions with only + obsolete successors. 
""" obsoletenotrebased = {} obsoletewithoutsuccessorindestination = set([]) + obsoleteextinctsuccessors = set([]) assert repo.filtername is None cl = repo.changelog nodemap = cl.nodemap + extinctnodes = set(cl.node(r) for r in repo.revs('extinct()')) for srcrev in rebaseobsrevs: srcnode = cl.node(srcrev) destnode = cl.node(destmap[srcrev]) # XXX: more advanced APIs are required to handle split correctly - successors = list(obsutil.allsuccessors(repo.obsstore, [srcnode])) - if len(successors) == 1: - # obsutil.allsuccessors includes node itself. When the list only - # contains one element, it means there are no successors. + successors = set(obsutil.allsuccessors(repo.obsstore, [srcnode])) + # obsutil.allsuccessors includes node itself + successors.remove(srcnode) + if successors.issubset(extinctnodes): + # all successors are extinct + obsoleteextinctsuccessors.add(srcrev) + if not successors: + # no successor obsoletenotrebased[srcrev] = None else: for succnode in successors: - if succnode == srcnode or succnode not in nodemap: + if succnode not in nodemap: continue if cl.isancestor(succnode, destnode): obsoletenotrebased[srcrev] = nodemap[succnode] @@ -1811,11 +1815,14 @@ # If 'srcrev' has a successor in rebase set but none in # destination (which would be catched above), we shall skip it # and its descendants to avoid divergence. 
- if any(nodemap[s] in destmap - for s in successors if s != srcnode): + if any(nodemap[s] in destmap for s in successors): obsoletewithoutsuccessorindestination.add(srcrev) - return obsoletenotrebased, obsoletewithoutsuccessorindestination + return ( + obsoletenotrebased, + obsoletewithoutsuccessorindestination, + obsoleteextinctsuccessors, + ) def summaryhook(ui, repo): if not repo.vfs.exists('rebasestate'): diff -r fb92df8b634c -r ed5448edcbfa hgext/releasenotes.py --- a/hgext/releasenotes.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/releasenotes.py Wed Apr 18 15:32:08 2018 -0400 @@ -311,8 +311,8 @@ title = block['lines'][0].strip() if block['lines'] else None if i + 1 == len(blocks): - raise error.Abort(_('release notes directive %s lacks content') - % directive) + raise error.Abort(_('changeset %s: release notes directive %s ' + 'lacks content') % (ctx, directive)) # Now search ahead and find all paragraphs attached to this # admonition. @@ -324,9 +324,12 @@ if pblock['type'] == 'margin': continue + if pblock['type'] == 'admonition': + break + if pblock['type'] != 'paragraph': - raise error.Abort(_('unexpected block in release notes ' - 'directive %s') % directive) + repo.ui.warn(_('changeset %s: unexpected block in release ' + 'notes directive %s\n') % (ctx, directive)) if pblock['indent'] > 0: paragraphs.append(pblock['lines']) diff -r fb92df8b634c -r ed5448edcbfa hgext/relink.py --- a/hgext/relink.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/relink.py Wed Apr 18 15:32:08 2018 -0400 @@ -18,6 +18,9 @@ registrar, util, ) +from mercurial.utils import ( + stringutil, +) cmdtable = {} command = registrar.command(cmdtable) @@ -168,8 +171,8 @@ source = os.path.join(src, f) tgt = os.path.join(dst, f) # Binary mode, so that read() works correctly, especially on Windows - sfp = file(source, 'rb') - dfp = file(tgt, 'rb') + sfp = open(source, 'rb') + dfp = open(tgt, 'rb') sin = sfp.read(CHUNKLEN) while sin: din = dfp.read(CHUNKLEN) @@ -187,7 +190,7 @@ relinked += 1 
savedbytes += sz except OSError as inst: - ui.warn('%s: %s\n' % (tgt, str(inst))) + ui.warn('%s: %s\n' % (tgt, stringutil.forcebytestr(inst))) ui.progress(_('relinking'), None) diff -r fb92df8b634c -r ed5448edcbfa hgext/remotenames.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/remotenames.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,372 @@ +# remotenames.py - extension to display remotenames +# +# Copyright 2017 Augie Fackler +# Copyright 2017 Sean Farley +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +""" showing remotebookmarks and remotebranches in UI + +By default both remotebookmarks and remotebranches are turned on. Config knob to +control the individually are as follows. + +Config options to tweak the default behaviour: + +remotenames.bookmarks + Boolean value to enable or disable showing of remotebookmarks (default: True) + +remotenames.branches + Boolean value to enable or disable showing of remotebranches (default: True) + +remotenames.hoistedpeer + Name of the peer whose remotebookmarks should be hoisted into the top-level + namespace (default: 'default') +""" + +from __future__ import absolute_import + +from mercurial.i18n import _ + +from mercurial.node import ( + bin, +) +from mercurial import ( + bookmarks, + extensions, + logexchange, + namespaces, + pycompat, + registrar, + revsetlang, + smartset, + templateutil, +) + +if pycompat.ispy3: + import collections.abc + mutablemapping = collections.abc.MutableMapping +else: + import collections + mutablemapping = collections.MutableMapping + +# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for +# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should +# be specifying the version(s) of Mercurial they are tested with, or +# leave the attribute unspecified. 
+testedwith = 'ships-with-hg-core' + +configtable = {} +configitem = registrar.configitem(configtable) +templatekeyword = registrar.templatekeyword() +revsetpredicate = registrar.revsetpredicate() + +configitem('remotenames', 'bookmarks', + default=True, +) +configitem('remotenames', 'branches', + default=True, +) +configitem('remotenames', 'hoistedpeer', + default='default', +) + +class lazyremotenamedict(mutablemapping): + """ + Read-only dict-like Class to lazily resolve remotename entries + + We are doing that because remotenames startup was slow. + We lazily read the remotenames file once to figure out the potential entries + and store them in self.potentialentries. Then when asked to resolve an + entry, if it is not in self.potentialentries, then it isn't there, if it + is in self.potentialentries we resolve it and store the result in + self.cache. We cannot be lazy is when asked all the entries (keys). + """ + def __init__(self, kind, repo): + self.cache = {} + self.potentialentries = {} + self._kind = kind # bookmarks or branches + self._repo = repo + self.loaded = False + + def _load(self): + """ Read the remotenames file, store entries matching selected kind """ + self.loaded = True + repo = self._repo + for node, rpath, rname in logexchange.readremotenamefile(repo, + self._kind): + name = rpath + '/' + rname + self.potentialentries[name] = (node, rpath, name) + + def _resolvedata(self, potentialentry): + """ Check that the node for potentialentry exists and return it """ + if not potentialentry in self.potentialentries: + return None + node, remote, name = self.potentialentries[potentialentry] + repo = self._repo + binnode = bin(node) + # if the node doesn't exist, skip it + try: + repo.changelog.rev(binnode) + except LookupError: + return None + # Skip closed branches + if (self._kind == 'branches' and repo[binnode].closesbranch()): + return None + return [binnode] + + def __getitem__(self, key): + if not self.loaded: + self._load() + val = 
self._fetchandcache(key) + if val is not None: + return val + else: + raise KeyError() + + def __iter__(self): + return iter(self.potentialentries) + + def __len__(self): + return len(self.potentialentries) + + def __setitem__(self): + raise NotImplementedError + + def __delitem__(self): + raise NotImplementedError + + def _fetchandcache(self, key): + if key in self.cache: + return self.cache[key] + val = self._resolvedata(key) + if val is not None: + self.cache[key] = val + return val + else: + return None + + def keys(self): + """ Get a list of bookmark or branch names """ + if not self.loaded: + self._load() + return self.potentialentries.keys() + + def iteritems(self): + """ Iterate over (name, node) tuples """ + + if not self.loaded: + self._load() + + for k, vtup in self.potentialentries.iteritems(): + yield (k, [bin(vtup[0])]) + +class remotenames(object): + """ + This class encapsulates all the remotenames state. It also contains + methods to access that state in convenient ways. Remotenames are lazy + loaded. Whenever client code needs to ensure the freshest copy of + remotenames, use the `clearnames` method to force an eventual load. 
+ """ + + def __init__(self, repo, *args): + self._repo = repo + self.clearnames() + + def clearnames(self): + """ Clear all remote names state """ + self.bookmarks = lazyremotenamedict("bookmarks", self._repo) + self.branches = lazyremotenamedict("branches", self._repo) + self._invalidatecache() + + def _invalidatecache(self): + self._nodetobmarks = None + self._nodetobranch = None + self._hoisttonodes = None + self._nodetohoists = None + + def bmarktonodes(self): + return self.bookmarks + + def nodetobmarks(self): + if not self._nodetobmarks: + bmarktonodes = self.bmarktonodes() + self._nodetobmarks = {} + for name, node in bmarktonodes.iteritems(): + self._nodetobmarks.setdefault(node[0], []).append(name) + return self._nodetobmarks + + def branchtonodes(self): + return self.branches + + def nodetobranch(self): + if not self._nodetobranch: + branchtonodes = self.branchtonodes() + self._nodetobranch = {} + for name, nodes in branchtonodes.iteritems(): + for node in nodes: + self._nodetobranch.setdefault(node, []).append(name) + return self._nodetobranch + + def hoisttonodes(self, hoist): + if not self._hoisttonodes: + marktonodes = self.bmarktonodes() + self._hoisttonodes = {} + hoist += '/' + for name, node in marktonodes.iteritems(): + if name.startswith(hoist): + name = name[len(hoist):] + self._hoisttonodes[name] = node + return self._hoisttonodes + + def nodetohoists(self, hoist): + if not self._nodetohoists: + marktonodes = self.bmarktonodes() + self._nodetohoists = {} + hoist += '/' + for name, node in marktonodes.iteritems(): + if name.startswith(hoist): + name = name[len(hoist):] + self._nodetohoists.setdefault(node[0], []).append(name) + return self._nodetohoists + +def wrapprintbookmarks(orig, ui, repo, bmarks, **opts): + if 'remotebookmarks' not in repo.names: + return + ns = repo.names['remotebookmarks'] + + for name in ns.listnames(repo): + nodes = ns.nodes(repo, name) + if not nodes: + continue + node = nodes[0] + + bmarks[name] = (node, ' ', '') + 
+ return orig(ui, repo, bmarks, **opts) + +def extsetup(ui): + extensions.wrapfunction(bookmarks, '_printbookmarks', wrapprintbookmarks) + +def reposetup(ui, repo): + if not repo.local(): + return + + repo._remotenames = remotenames(repo) + ns = namespaces.namespace + + if ui.configbool('remotenames', 'bookmarks'): + remotebookmarkns = ns( + 'remotebookmarks', + templatename='remotebookmarks', + colorname='remotebookmark', + logfmt='remote bookmark: %s\n', + listnames=lambda repo: repo._remotenames.bmarktonodes().keys(), + namemap=lambda repo, name: + repo._remotenames.bmarktonodes().get(name, []), + nodemap=lambda repo, node: + repo._remotenames.nodetobmarks().get(node, [])) + repo.names.addnamespace(remotebookmarkns) + + # hoisting only works if there are remote bookmarks + hoist = ui.config('remotenames', 'hoistedpeer') + if hoist: + hoistednamens = ns( + 'hoistednames', + templatename='hoistednames', + colorname='hoistedname', + logfmt='hoisted name: %s\n', + listnames = lambda repo: + repo._remotenames.hoisttonodes(hoist).keys(), + namemap = lambda repo, name: + repo._remotenames.hoisttonodes(hoist).get(name, []), + nodemap = lambda repo, node: + repo._remotenames.nodetohoists(hoist).get(node, [])) + repo.names.addnamespace(hoistednamens) + + if ui.configbool('remotenames', 'branches'): + remotebranchns = ns( + 'remotebranches', + templatename='remotebranches', + colorname='remotebranch', + logfmt='remote branch: %s\n', + listnames = lambda repo: repo._remotenames.branchtonodes().keys(), + namemap = lambda repo, name: + repo._remotenames.branchtonodes().get(name, []), + nodemap = lambda repo, node: + repo._remotenames.nodetobranch().get(node, [])) + repo.names.addnamespace(remotebranchns) + +@templatekeyword('remotenames', requires={'repo', 'ctx'}) +def remotenameskw(context, mapping): + """List of strings. 
Remote names associated with the changeset.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + + remotenames = [] + if 'remotebookmarks' in repo.names: + remotenames = repo.names['remotebookmarks'].names(repo, ctx.node()) + + if 'remotebranches' in repo.names: + remotenames += repo.names['remotebranches'].names(repo, ctx.node()) + + return templateutil.compatlist(context, mapping, 'remotename', remotenames, + plural='remotenames') + +@templatekeyword('remotebookmarks', requires={'repo', 'ctx'}) +def remotebookmarkskw(context, mapping): + """List of strings. Remote bookmarks associated with the changeset.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + + remotebmarks = [] + if 'remotebookmarks' in repo.names: + remotebmarks = repo.names['remotebookmarks'].names(repo, ctx.node()) + + return templateutil.compatlist(context, mapping, 'remotebookmark', + remotebmarks, plural='remotebookmarks') + +@templatekeyword('remotebranches', requires={'repo', 'ctx'}) +def remotebrancheskw(context, mapping): + """List of strings. 
Remote branches associated with the changeset.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + + remotebranches = [] + if 'remotebranches' in repo.names: + remotebranches = repo.names['remotebranches'].names(repo, ctx.node()) + + return templateutil.compatlist(context, mapping, 'remotebranch', + remotebranches, plural='remotebranches') + +def _revsetutil(repo, subset, x, rtypes): + """utility function to return a set of revs based on the rtypes""" + + revs = set() + cl = repo.changelog + for rtype in rtypes: + if rtype in repo.names: + ns = repo.names[rtype] + for name in ns.listnames(repo): + revs.update(ns.nodes(repo, name)) + + results = (cl.rev(n) for n in revs if cl.hasnode(n)) + return subset & smartset.baseset(sorted(results)) + +@revsetpredicate('remotenames()') +def remotenamesrevset(repo, subset, x): + """All changesets which have a remotename on them.""" + revsetlang.getargs(x, 0, 0, _("remotenames takes no arguments")) + return _revsetutil(repo, subset, x, ('remotebookmarks', 'remotebranches')) + +@revsetpredicate('remotebranches()') +def remotebranchesrevset(repo, subset, x): + """All changesets which are branch heads on remotes.""" + revsetlang.getargs(x, 0, 0, _("remotebranches takes no arguments")) + return _revsetutil(repo, subset, x, ('remotebranches',)) + +@revsetpredicate('remotebookmarks()') +def remotebmarksrevset(repo, subset, x): + """All changesets which have bookmarks on remotes.""" + revsetlang.getargs(x, 0, 0, _("remotebookmarks takes no arguments")) + return _revsetutil(repo, subset, x, ('remotebookmarks',)) diff -r fb92df8b634c -r ed5448edcbfa hgext/schemes.py --- a/hgext/schemes.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/schemes.py Wed Apr 18 15:32:08 2018 -0400 @@ -78,9 +78,9 @@ def __repr__(self): return '' % self.scheme - def instance(self, ui, url, create): + def instance(self, ui, url, create, intents=None): url = self.resolve(url) - return hg._peerlookup(url).instance(ui, url, create) 
+ return hg._peerlookup(url).instance(ui, url, create, intents=intents) def resolve(self, url): # Should this use the util.url class, or is manual parsing better? @@ -94,7 +94,7 @@ parts = parts[:-1] else: tail = '' - context = dict((str(i + 1), v) for i, v in enumerate(parts)) + context = dict(('%d' % (i + 1), v) for i, v in enumerate(parts)) return ''.join(self.templater.process(self.url, context)) + tail def hasdriveletter(orig, path): diff -r fb92df8b634c -r ed5448edcbfa hgext/share.py --- a/hgext/share.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/share.py Wed Apr 18 15:32:08 2018 -0400 @@ -52,9 +52,6 @@ util, ) -repository = hg.repository -parseurl = hg.parseurl - cmdtable = {} command = registrar.command(cmdtable) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for @@ -135,27 +132,9 @@ return False return hg.sharedbookmarks in shared -def _getsrcrepo(repo): - """ - Returns the source repository object for a given shared repository. - If repo is not a shared repository, return None. - """ - if repo.sharedpath == repo.path: - return None - - if util.safehasattr(repo, 'srcrepo') and repo.srcrepo: - return repo.srcrepo - - # the sharedpath always ends in the .hg; we want the path to the repo - source = repo.vfs.split(repo.sharedpath)[0] - srcurl, branches = parseurl(source) - srcrepo = repository(repo.ui, srcurl) - repo.srcrepo = srcrepo - return srcrepo - def getbkfile(orig, repo): if _hassharedbookmarks(repo): - srcrepo = _getsrcrepo(repo) + srcrepo = hg.sharedreposource(repo) if srcrepo is not None: # just orig(srcrepo) doesn't work as expected, because # HG_PENDING refers repo.root. 
@@ -186,7 +165,7 @@ orig(self, tr) if _hassharedbookmarks(self._repo): - srcrepo = _getsrcrepo(self._repo) + srcrepo = hg.sharedreposource(self._repo) if srcrepo is not None: category = 'share-bookmarks' tr.addpostclose(category, lambda tr: self._writerepo(srcrepo)) @@ -196,6 +175,6 @@ orig(self, repo) if _hassharedbookmarks(self._repo): - srcrepo = _getsrcrepo(self._repo) + srcrepo = hg.sharedreposource(self._repo) if srcrepo is not None: orig(self, srcrepo) diff -r fb92df8b634c -r ed5448edcbfa hgext/shelve.py --- a/hgext/shelve.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/shelve.py Wed Apr 18 15:32:08 2018 -0400 @@ -25,6 +25,7 @@ import collections import errno import itertools +import stat from mercurial.i18n import _ from mercurial import ( @@ -55,6 +56,10 @@ from . import ( rebase, ) +from mercurial.utils import ( + dateutil, + stringutil, +) cmdtable = {} command = registrar.command(cmdtable) @@ -192,7 +197,7 @@ d['nodestoremove'] = [nodemod.bin(h) for h in d['nodestoremove'].split(' ')] except (ValueError, TypeError, KeyError) as err: - raise error.CorruptedState(str(err)) + raise error.CorruptedState(pycompat.bytestr(err)) @classmethod def _getversion(cls, repo): @@ -201,7 +206,7 @@ try: version = int(fp.readline().strip()) except ValueError as err: - raise error.CorruptedState(str(err)) + raise error.CorruptedState(pycompat.bytestr(err)) finally: fp.close() return version @@ -251,7 +256,7 @@ if d.get('activebook', '') != cls._noactivebook: obj.activebookmark = d.get('activebook', '') except (error.RepoLookupError, KeyError) as err: - raise error.CorruptedState(str(err)) + raise error.CorruptedState(pycompat.bytestr(err)) return obj @@ -271,7 +276,7 @@ "activebook": activebook or cls._noactivebook } scmutil.simplekeyvaluefile(repo.vfs, cls._filename)\ - .write(info, firstline=str(cls._version)) + .write(info, firstline=("%d" % cls._version)) @classmethod def clear(cls, repo): @@ -282,7 +287,7 @@ maxbackups = repo.ui.configint('shelve', 'maxbackups') 
hgfiles = [f for f in vfs.listdir() if f.endswith('.' + patchextension)] - hgfiles = sorted([(vfs.stat(f).st_mtime, f) for f in hgfiles]) + hgfiles = sorted([(vfs.stat(f)[stat.ST_MTIME], f) for f in hgfiles]) if 0 < maxbackups and maxbackups < len(hgfiles): bordermtime = hgfiles[-maxbackups][0] else: @@ -408,9 +413,8 @@ def _shelvecreatedcommit(repo, node, name): bases = list(mutableancestors(repo[node])) shelvedfile(repo, name, 'hg').writebundle(bases, node) - cmdutil.export(repo, [node], - fp=shelvedfile(repo, name, patchextension).opener('wb'), - opts=mdiff.diffopts(git=True)) + with shelvedfile(repo, name, patchextension).opener('wb') as fp: + cmdutil.exportfile(repo, [node], fp, opts=mdiff.diffopts(git=True)) def _includeunknownfiles(repo, pats, opts, extra): s = repo.status(match=scmutil.match(repo[None], pats, opts), @@ -475,7 +479,7 @@ _shelvecreatedcommit(repo, node, name) if ui.formatted(): - desc = util.ellipsis(desc, ui.termwidth()) + desc = stringutil.ellipsis(desc, ui.termwidth()) ui.status(_('shelved as %s\n') % name) hg.update(repo, parent.node()) if origbranch != repo['.'].branch() and not _isbareshelve(pats, opts): @@ -541,7 +545,7 @@ if not pfx or sfx != patchextension: continue st = shelvedfile(repo, name).stat() - info.append((st.st_mtime, shelvedfile(repo, pfx).filename())) + info.append((st[stat.ST_MTIME], shelvedfile(repo, pfx).filename())) return sorted(info, reverse=True) def listcmd(ui, repo, pats, opts): @@ -563,7 +567,8 @@ continue ui.write(' ' * (16 - len(sname))) used = 16 - age = '(%s)' % templatefilters.age(util.makedate(mtime), abbrev=True) + date = dateutil.makedate(mtime) + age = '(%s)' % templatefilters.age(date, abbrev=True) ui.write(age, label='shelve.age') ui.write(' ' * (12 - len(age))) used += 12 @@ -575,7 +580,7 @@ if not line.startswith('#'): desc = line.rstrip() if ui.formatted(): - desc = util.ellipsis(desc, width - used) + desc = stringutil.ellipsis(desc, width - used) ui.write(desc) break ui.write('\n') @@ -619,7 
+624,7 @@ repo.vfs.rename('unshelverebasestate', 'rebasestate') try: rebase.rebase(ui, repo, **{ - 'abort' : True + r'abort' : True }) except Exception: repo.vfs.rename('rebasestate', 'unshelverebasestate') @@ -648,7 +653,7 @@ ui.pushbuffer(True) cmdutil.revert(ui, repo, shelvectx, repo.dirstate.parents(), *pathtofiles(repo, files), - **{'no_backup': True}) + **{r'no_backup': True}) ui.popbuffer() def restorebranch(ui, repo, branchtorestore): @@ -681,7 +686,7 @@ repo.vfs.rename('unshelverebasestate', 'rebasestate') try: rebase.rebase(ui, repo, **{ - 'continue' : True + r'continue' : True }) except Exception: repo.vfs.rename('rebasestate', 'unshelverebasestate') @@ -744,10 +749,10 @@ ui.status(_('rebasing shelved changes\n')) try: rebase.rebase(ui, repo, **{ - 'rev': [shelvectx.rev()], - 'dest': str(tmpwctx.rev()), - 'keep': True, - 'tool': opts.get('tool', ''), + r'rev': [shelvectx.rev()], + r'dest': "%d" % tmpwctx.rev(), + r'keep': True, + r'tool': opts.get('tool', ''), }) except error.InterventionRequired: tr.close() @@ -881,7 +886,7 @@ raise cmdutil.wrongtooltocontinue(repo, _('unshelve')) except error.CorruptedState as err: - ui.debug(str(err) + '\n') + ui.debug(pycompat.bytestr(err) + '\n') if continuef: msg = _('corrupted shelved state file') hint = _('please run hg unshelve --abort to abort unshelve ' diff -r fb92df8b634c -r ed5448edcbfa hgext/show.py --- a/hgext/show.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/show.py Wed Apr 18 15:32:08 2018 -0400 @@ -29,7 +29,6 @@ from mercurial.i18n import _ from mercurial.node import ( - hex, nullrev, ) from mercurial import ( @@ -39,11 +38,13 @@ error, formatter, graphmod, + logcmdutil, phases, pycompat, registrar, revset, revsetlang, + scmutil, ) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for @@ -125,7 +126,7 @@ ui.write('\n') for name, func in sorted(views.items()): - ui.write(('%s\n') % func.__doc__) + ui.write(('%s\n') % pycompat.sysbytes(func.__doc__)) ui.write('\n') raise 
error.Abort(_('no view requested'), @@ -148,7 +149,7 @@ elif fn._csettopic: ref = 'show%s' % fn._csettopic spec = formatter.lookuptemplate(ui, ref, template) - displayer = cmdutil.changeset_templater(ui, repo, spec, buffered=True) + displayer = logcmdutil.changesettemplater(ui, repo, spec, buffered=True) return fn(ui, repo, displayer) else: return fn(ui, repo) @@ -259,7 +260,7 @@ shortesttmpl = formatter.maketemplater(ui, '{shortest(node, %d)}' % nodelen, resources=tres) def shortest(ctx): - return shortesttmpl.render({'ctx': ctx, 'node': ctx.hex()}) + return shortesttmpl.renderdefault({'ctx': ctx, 'node': ctx.hex()}) # We write out new heads to aid in DAG awareness and to help with decision # making on how the stack should be reconciled with commits made since the @@ -409,8 +410,8 @@ revdag = graphmod.dagwalker(repo, revs) ui.setconfig('experimental', 'graphshorten', True) - cmdutil.displaygraph(ui, repo, revdag, displayer, graphmod.asciiedges, - props={'nodelen': nodelen}) + logcmdutil.displaygraph(ui, repo, revdag, displayer, graphmod.asciiedges, + props={'nodelen': nodelen}) def extsetup(ui): # Alias `hg ` to `hg show `. @@ -426,7 +427,7 @@ continue # Same for aliases. - if ui.config('alias', name): + if ui.config('alias', name, None): continue ui.setconfig('alias', name, 'show %s' % view, source='show') @@ -445,9 +446,9 @@ """ if not revs: return minlen - # don't use filtered repo because it's slow. see templater.shortest(). - cl = repo.unfiltered().changelog - return max(len(cl.shortest(hex(cl.node(r)), minlen)) for r in revs) + cl = repo.changelog + return max(len(scmutil.shortesthexnodeidprefix(repo, cl.node(r), minlen)) + for r in revs) # Adjust the docstring of the show command so it shows all registered views. # This is a bit hacky because it runs at the end of module load. 
When moved diff -r fb92df8b634c -r ed5448edcbfa hgext/sparse.py --- a/hgext/sparse.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/sparse.py Wed Apr 18 15:32:08 2018 -0400 @@ -75,12 +75,12 @@ from mercurial.i18n import _ from mercurial import ( - cmdutil, commands, dirstate, error, extensions, hg, + logcmdutil, match as matchmod, pycompat, registrar, @@ -126,7 +126,7 @@ entry[1].append(('', 'sparse', None, "limit to changesets affecting the sparse checkout")) - def _logrevs(orig, repo, opts): + def _initialrevs(orig, repo, opts): revs = orig(repo, opts) if opts.get('sparse'): sparsematch = sparse.matcher(repo) @@ -135,7 +135,7 @@ return any(f for f in ctx.files() if sparsematch(f)) revs = revs.filter(ctxmatch) return revs - extensions.wrapfunction(cmdutil, '_logrevs', _logrevs) + extensions.wrapfunction(logcmdutil, '_initialrevs', _initialrevs) def _clonesparsecmd(orig, ui, repo, *args, **opts): include_pat = opts.get('include') @@ -194,7 +194,11 @@ """ def walk(orig, self, match, subrepos, unknown, ignored, full=True): - match = matchmod.intersectmatchers(match, self._sparsematcher) + # hack to not exclude explicitly-specified paths so that they can + # be warned later on e.g. 
dirstate.add() + em = matchmod.exact(match._root, match._cwd, match.files()) + sm = matchmod.unionmatcher([self._sparsematcher, em]) + match = matchmod.intersectmatchers(match, sm) return orig(self, match, subrepos, unknown, ignored, full) extensions.wrapfunction(dirstate.dirstate, 'walk', walk) diff -r fb92df8b634c -r ed5448edcbfa hgext/split.py --- a/hgext/split.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/split.py Wed Apr 18 15:32:08 2018 -0400 @@ -24,6 +24,7 @@ hg, obsolete, phases, + pycompat, registrar, revsetlang, scmutil, @@ -160,7 +161,7 @@ 'interactive': True, 'message': header + ctx.description(), }) - commands.commit(ui, repo, **opts) + commands.commit(ui, repo, **pycompat.strkwargs(opts)) newctx = repo['.'] committed.append(newctx) @@ -172,6 +173,6 @@ return committed[-1] -def dorebase(ui, repo, src, dest): +def dorebase(ui, repo, src, destctx): rebase.rebase(ui, repo, rev=[revsetlang.formatspec('%ld', src)], - dest=revsetlang.formatspec('%d', dest)) + dest=revsetlang.formatspec('%d', destctx.rev())) diff -r fb92df8b634c -r ed5448edcbfa hgext/strip.py --- a/hgext/strip.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/strip.py Wed Apr 18 15:32:08 2018 -0400 @@ -181,13 +181,10 @@ strippedrevs = revs.union(descendants) roots = revs.difference(descendants) - update = False # if one of the wdir parent is stripped we'll need # to update away to an earlier revision - for p in repo.dirstate.parents(): - if p != nullid and cl.rev(p) in strippedrevs: - update = True - break + update = any(p != nullid and cl.rev(p) in strippedrevs + for p in repo.dirstate.parents()) rootnodes = set(cl.node(r) for r in roots) @@ -215,7 +212,7 @@ # only reset the dirstate for files that would actually change # between the working context and uctx - descendantrevs = repo.revs("%s::." 
% uctx.rev()) + descendantrevs = repo.revs(b"%d::.", uctx.rev()) changedfiles = [] for rev in descendantrevs: # blindly reset the files, regardless of what actually changed diff -r fb92df8b634c -r ed5448edcbfa hgext/transplant.py --- a/hgext/transplant.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/transplant.py Wed Apr 18 15:32:08 2018 -0400 @@ -24,6 +24,7 @@ error, exchange, hg, + logcmdutil, match, merge, node as nodemod, @@ -37,6 +38,10 @@ util, vfs as vfsmod, ) +from mercurial.utils import ( + procutil, + stringutil, +) class TransplantError(error.Abort): pass @@ -119,7 +124,8 @@ opener=self.opener) def getcommiteditor(): editform = cmdutil.mergeeditform(repo[None], 'transplant') - return cmdutil.getcommiteditor(editform=editform, **opts) + return cmdutil.getcommiteditor(editform=editform, + **pycompat.strkwargs(opts)) self.getcommiteditor = getcommiteditor def applied(self, repo, node, parent): @@ -160,7 +166,7 @@ tr = repo.transaction('transplant') for rev in revs: node = revmap[rev] - revstr = '%s:%s' % (rev, nodemod.short(node)) + revstr = '%d:%s' % (rev, nodemod.short(node)) if self.applied(repo, node, p1): self.ui.warn(_('skipping already applied revision %s\n') % @@ -194,7 +200,7 @@ skipmerge = False if parents[1] != revlog.nullid: if not opts.get('parent'): - self.ui.note(_('skipping merge changeset %s:%s\n') + self.ui.note(_('skipping merge changeset %d:%s\n') % (rev, nodemod.short(node))) skipmerge = True else: @@ -210,7 +216,7 @@ patchfile = None else: fd, patchfile = tempfile.mkstemp(prefix='hg-transplant-') - fp = os.fdopen(fd, pycompat.sysstr('w')) + fp = os.fdopen(fd, r'wb') gen = patch.diff(source, parent, node, opts=diffopts) for chunk in gen: fp.write(chunk) @@ -258,7 +264,7 @@ self.ui.status(_('filtering %s\n') % patchfile) user, date, msg = (changelog[1], changelog[2], changelog[4]) fd, headerfile = tempfile.mkstemp(prefix='hg-transplant-') - fp = os.fdopen(fd, pycompat.sysstr('w')) + fp = os.fdopen(fd, r'wb') fp.write("# HG changeset 
patch\n") fp.write("# User %s\n" % user) fp.write("# Date %d %d\n" % date) @@ -266,14 +272,15 @@ fp.close() try: - self.ui.system('%s %s %s' % (filter, util.shellquote(headerfile), - util.shellquote(patchfile)), + self.ui.system('%s %s %s' % (filter, + procutil.shellquote(headerfile), + procutil.shellquote(patchfile)), environ={'HGUSER': changelog[1], 'HGREVISION': nodemod.hex(node), }, onerr=error.Abort, errprefix=_('filter failed'), blockedtag='transplant_filter') - user, date, msg = self.parselog(file(headerfile))[1:4] + user, date, msg = self.parselog(open(headerfile, 'rb'))[1:4] finally: os.unlink(headerfile) @@ -309,7 +316,7 @@ p1 = repo.dirstate.p1() p2 = node self.log(user, date, message, p1, p2, merge=merge) - self.ui.write(str(inst) + '\n') + self.ui.write(stringutil.forcebytestr(inst) + '\n') raise TransplantError(_('fix up the working directory and run ' 'hg transplant --continue')) else: @@ -501,7 +508,7 @@ def browserevs(ui, repo, nodes, opts): '''interactively transplant changesets''' - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) transplants = [] merges = [] prompt = _('apply changeset? 
[ynmpcq?]:' @@ -646,6 +653,7 @@ raise error.Abort(_('--all is incompatible with a ' 'revision list')) + opts = pycompat.byteskwargs(opts) checkopts(opts, revs) if not opts.get('log'): @@ -695,7 +703,7 @@ tf = tp.transplantfilter(repo, source, p1) if opts.get('prune'): - prune = set(source.lookup(r) + prune = set(source[r].node() for r in scmutil.revrange(source, opts.get('prune'))) matchfn = lambda x: tf(x) and x not in prune else: @@ -704,7 +712,7 @@ revmap = {} if revs: for r in scmutil.revrange(source, revs): - revmap[int(r)] = source.lookup(r) + revmap[int(r)] = source[r].node() elif opts.get('all') or not merges: if source != repo: alltransplants = incwalk(source, csets, match=matchfn) @@ -741,10 +749,11 @@ templatekeyword = registrar.templatekeyword() -@templatekeyword('transplanted') -def kwtransplanted(repo, ctx, **args): +@templatekeyword('transplanted', requires={'ctx'}) +def kwtransplanted(context, mapping): """String. The node identifier of the transplanted changeset if any.""" + ctx = context.resource(mapping, 'ctx') n = ctx.extra().get('transplant_source') return n and nodemod.hex(n) or '' diff -r fb92df8b634c -r ed5448edcbfa hgext/uncommit.py --- a/hgext/uncommit.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/uncommit.py Wed Apr 18 15:32:08 2018 -0400 @@ -51,7 +51,7 @@ # leave the attribute unspecified. testedwith = 'ships-with-hg-core' -def _commitfiltered(repo, ctx, match, allowempty): +def _commitfiltered(repo, ctx, match, keepcommit): """Recommit ctx with changed files not in match. Return the new node identifier, or None if nothing changed. """ @@ -66,7 +66,7 @@ files = (initialfiles - exclude) # return the p1 so that we don't create an obsmarker later - if not files and not allowempty: + if not keepcommit: return ctx.parents()[0].node() # Filter copies @@ -151,13 +151,16 @@ files to their uncommitted state. This means that files modified or deleted in the changeset will be left unchanged, and so will remain modified in the working directory. 
+ + If no files are specified, the commit will be pruned, unless --keep is + given. """ opts = pycompat.byteskwargs(opts) with repo.wlock(), repo.lock(): if not pats and not repo.ui.configbool('experimental', - 'uncommitondirtywdir'): + 'uncommitondirtywdir'): cmdutil.bailifchanged(repo) old = repo['.'] rewriteutil.precheck(repo, [old.rev()], 'uncommit') @@ -166,7 +169,8 @@ with repo.transaction('uncommit'): match = scmutil.match(old, pats, opts) - newid = _commitfiltered(repo, old, match, opts.get('keep')) + keepcommit = opts.get('keep') or pats + newid = _commitfiltered(repo, old, match, keepcommit) if newid is None: ui.status(_("nothing to uncommit\n")) return 1 diff -r fb92df8b634c -r ed5448edcbfa hgext/win32text.py --- a/hgext/win32text.py Wed Apr 04 10:35:09 2018 -0400 +++ b/hgext/win32text.py Wed Apr 18 15:32:08 2018 -0400 @@ -50,7 +50,9 @@ ) from mercurial import ( registrar, - util, +) +from mercurial.utils import ( + stringutil, ) # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for @@ -100,22 +102,22 @@ return s.replace('\r', '\n') def cleverdecode(s, cmd, **kwargs): - if not util.binary(s): + if not stringutil.binary(s): return dumbdecode(s, cmd, **kwargs) return s def cleverencode(s, cmd): - if not util.binary(s): + if not stringutil.binary(s): return dumbencode(s, cmd) return s def macdecode(s, cmd, **kwargs): - if not util.binary(s): + if not stringutil.binary(s): return macdumbdecode(s, cmd, **kwargs) return s def macencode(s, cmd): - if not util.binary(s): + if not stringutil.binary(s): return macdumbencode(s, cmd) return s @@ -146,7 +148,7 @@ continue seen.add(f) data = c[f].data() - if not util.binary(data) and newline in data: + if not stringutil.binary(data) and newline in data: if not halt: ui.warn(_('attempt to commit or push text file(s) ' 'using %s line endings\n') % diff -r fb92df8b634c -r ed5448edcbfa i18n/hggettext --- a/i18n/hggettext Wed Apr 04 10:35:09 2018 -0400 +++ b/i18n/hggettext Wed Apr 18 15:32:08 
2018 -0400 @@ -104,7 +104,8 @@ """ mod = importpath(path) if not path.startswith('mercurial/') and mod.__doc__: - src = open(path).read() + with open(path) as fobj: + src = fobj.read() lineno = 1 + offset(src, mod.__doc__, path, 7) print(poentry(path, lineno, mod.__doc__)) @@ -143,7 +144,8 @@ def rawtext(path): - src = open(path).read() + with open(path) as f: + src = f.read() print(poentry(path, 1, src)) diff -r fb92df8b634c -r ed5448edcbfa mercurial/__init__.py --- a/mercurial/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -31,6 +31,9 @@ # Only handle Mercurial-related modules. if not fullname.startswith(('mercurial.', 'hgext.', 'hgext3rd.')): return None + # don't try to parse binary + if fullname.startswith('mercurial.cext.'): + return None # third-party packages are expected to be dual-version clean if fullname.startswith('mercurial.thirdparty'): return None diff -r fb92df8b634c -r ed5448edcbfa mercurial/archival.py --- a/mercurial/archival.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/archival.py Wed Apr 18 15:32:08 2018 -0400 @@ -21,6 +21,8 @@ error, formatter, match as matchmod, + pycompat, + scmutil, util, vfs as vfsmod, ) @@ -37,7 +39,7 @@ if prefix: prefix = util.normpath(prefix) else: - if not isinstance(dest, str): + if not isinstance(dest, bytes): raise ValueError('dest must be string if no prefix') prefix = os.path.basename(dest) lower = prefix.lower() @@ -76,29 +78,27 @@ return repo[rev] return repo['null'] +# {tags} on ctx includes local tags and 'tip', with no current way to limit +# that to global tags. Therefore, use {latesttag} as a substitute when +# the distance is 0, since that will be the list of global tags on ctx. 
+_defaultmetatemplate = br''' +repo: {root} +node: {ifcontains(rev, revset("wdir()"), "{p1node}{dirty}", "{node}")} +branch: {branch|utf8} +{ifeq(latesttagdistance, 0, join(latesttag % "tag: {tag}", "\n"), + separate("\n", + join(latesttag % "latesttag: {tag}", "\n"), + "latesttagdistance: {latesttagdistance}", + "changessincelatesttag: {changessincelatesttag}"))} +'''[1:] # drop leading '\n' + def buildmetadata(ctx): '''build content of .hg_archival.txt''' repo = ctx.repo() - default = ( - r'repo: {root}\n' - r'node: {ifcontains(rev, revset("wdir()"),' - r'"{p1node}{dirty}", "{node}")}\n' - r'branch: {branch|utf8}\n' - - # {tags} on ctx includes local tags and 'tip', with no current way to - # limit that to global tags. Therefore, use {latesttag} as a substitute - # when the distance is 0, since that will be the list of global tags on - # ctx. - r'{ifeq(latesttagdistance, 0, latesttag % "tag: {tag}\n",' - r'"{latesttag % "latesttag: {tag}\n"}' - r'latesttagdistance: {latesttagdistance}\n' - r'changessincelatesttag: {changessincelatesttag}\n")}' - ) - opts = { 'template': repo.ui.config('experimental', 'archivemetatemplate', - default) + _defaultmetatemplate) } out = util.stringio() @@ -125,7 +125,7 @@ def __init__(self, *args, **kw): timestamp = None - if 'timestamp' in kw: + if r'timestamp' in kw: timestamp = kw.pop(r'timestamp') if timestamp is None: self.timestamp = time.time() @@ -142,8 +142,8 @@ flags = 0 if fname: flags = gzip.FNAME - self.fileobj.write(chr(flags)) - gzip.write32u(self.fileobj, long(self.timestamp)) + self.fileobj.write(pycompat.bytechr(flags)) + gzip.write32u(self.fileobj, int(self.timestamp)) self.fileobj.write('\002') self.fileobj.write('\377') if fname: @@ -155,30 +155,34 @@ def taropen(mode, name='', fileobj=None): if kind == 'gz': - mode = mode[0] + mode = mode[0:1] if not fileobj: fileobj = open(name, mode + 'b') - gzfileobj = self.GzipFileWithTime(name, mode + 'b', + gzfileobj = self.GzipFileWithTime(name, + pycompat.sysstr(mode + 
'b'), zlib.Z_BEST_COMPRESSION, fileobj, timestamp=mtime) self.fileobj = gzfileobj - return tarfile.TarFile.taropen(name, mode, gzfileobj) + return tarfile.TarFile.taropen( + name, pycompat.sysstr(mode), gzfileobj) else: - return tarfile.open(name, mode + kind, fileobj) + return tarfile.open( + name, pycompat.sysstr(mode + kind), fileobj) - if isinstance(dest, str): + if isinstance(dest, bytes): self.z = taropen('w:', name=dest) else: self.z = taropen('w|', fileobj=dest) def addfile(self, name, mode, islink, data): + name = pycompat.fsdecode(name) i = tarfile.TarInfo(name) i.mtime = self.mtime i.size = len(data) if islink: i.type = tarfile.SYMTYPE i.mode = 0o777 - i.linkname = data + i.linkname = pycompat.fsdecode(data) data = None i.size = 0 else: @@ -191,35 +195,12 @@ if self.fileobj: self.fileobj.close() -class tellable(object): - '''provide tell method for zipfile.ZipFile when writing to http - response file object.''' - - def __init__(self, fp): - self.fp = fp - self.offset = 0 - - def __getattr__(self, key): - return getattr(self.fp, key) - - def write(self, s): - self.fp.write(s) - self.offset += len(s) - - def tell(self): - return self.offset - class zipit(object): '''write archive to zip file or stream. can write uncompressed, or compressed with deflate.''' def __init__(self, dest, mtime, compress=True): - if not isinstance(dest, str): - try: - dest.tell() - except (AttributeError, IOError): - dest = tellable(dest) - self.z = zipfile.ZipFile(dest, 'w', + self.z = zipfile.ZipFile(pycompat.fsdecode(dest), r'w', compress and zipfile.ZIP_DEFLATED or zipfile.ZIP_STORED) @@ -233,7 +214,7 @@ self.date_time = time.gmtime(mtime)[:6] def addfile(self, name, mode, islink, data): - i = zipfile.ZipInfo(name, self.date_time) + i = zipfile.ZipInfo(pycompat.fsdecode(name), self.date_time) i.compress_type = self.z.compression # unzip will not honor unix file modes unless file creator is # set to unix (id 3). 
@@ -268,7 +249,7 @@ if islink: self.opener.symlink(data, name) return - f = self.opener(name, "w", atomictemp=True) + f = self.opener(name, "w", atomictemp=False) f.write(data) f.close() destfile = os.path.join(self.basedir, name) @@ -339,6 +320,8 @@ total = len(files) if total: files.sort() + scmutil.prefetchfiles(repo, [ctx.rev()], + scmutil.matchfiles(repo, files)) repo.ui.progress(_('archiving'), 0, unit=_('files'), total=total) for i, f in enumerate(files): ff = ctx.flags(f) diff -r fb92df8b634c -r ed5448edcbfa mercurial/bookmarks.py --- a/mercurial/bookmarks.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/bookmarks.py Wed Apr 18 15:32:08 2018 -0400 @@ -84,7 +84,7 @@ # - node in nm, for non-20-bytes entry # - split(...), for string without ' ' repo.ui.warn(_('malformed line in .hg/bookmarks: %r\n') - % line) + % pycompat.bytestr(line)) except IOError as inst: if inst.errno != errno.ENOENT: raise @@ -103,30 +103,24 @@ self._aclean = False def __setitem__(self, *args, **kwargs): - msg = ("'bookmarks[name] = node' is deprecated, " - "use 'bookmarks.applychanges'") - self._repo.ui.deprecwarn(msg, '4.3') - self._set(*args, **kwargs) + raise error.ProgrammingError("use 'bookmarks.applychanges' instead") def _set(self, key, value): self._clean = False return dict.__setitem__(self, key, value) def __delitem__(self, key): - msg = ("'del bookmarks[name]' is deprecated, " - "use 'bookmarks.applychanges'") - self._repo.ui.deprecwarn(msg, '4.3') - self._del(key) + raise error.ProgrammingError("use 'bookmarks.applychanges' instead") def _del(self, key): self._clean = False return dict.__delitem__(self, key) def update(self, *others): - msg = ("bookmarks.update(...)' is deprecated, " - "use 'bookmarks.applychanges'") - self._repo.ui.deprecwarn(msg, '4.5') - return dict.update(self, *others) + raise error.ProgrammingError("use 'bookmarks.applychanges' instead") + + def changectx(self, mark): + return self._repo[self[mark]] def applychanges(self, repo, tr, changes): """Apply 
a list of changes to bookmarks @@ -146,12 +140,6 @@ bmchanges[name] = (old, node) self._recordchange(tr) - def recordchange(self, tr): - msg = ("'bookmarks.recorchange' is deprecated, " - "use 'bookmarks.applychanges'") - self._repo.ui.deprecwarn(msg, '4.3') - return self._recordchange(tr) - def _recordchange(self, tr): """record that bookmarks have been changed in a transaction @@ -194,7 +182,7 @@ self._aclean = True def _write(self, fp): - for name, node in self.iteritems(): + for name, node in sorted(self.iteritems()): fp.write("%s %s\n" % (hex(node), encoding.fromlocal(name))) self._clean = True self._repo.invalidatevolatilesets() @@ -219,7 +207,7 @@ If divergent bookmark are to be deleted, they will be returned as list. """ - cur = self._repo.changectx('.').node() + cur = self._repo['.'].node() if mark in self and not force: if target: if self[mark] == target and target == cur: @@ -227,8 +215,8 @@ return [] rev = self._repo[target].rev() anc = self._repo.changelog.ancestors([rev]) - bmctx = self._repo[self[mark]] - divs = [self._repo[b].node() for b in self + bmctx = self.changectx(mark) + divs = [self[b] for b in self if b.split('@', 1)[0] == mark.split('@', 1)[0]] # allow resolving a single divergent bookmark even if moving @@ -253,7 +241,7 @@ _("a bookmark cannot have the name of an existing branch")) if len(mark) > 3 and not force: try: - shadowhash = (mark in self._repo) + shadowhash = scmutil.isrevsymbol(self._repo, mark) except error.LookupError: # ambiguous identifier shadowhash = False if shadowhash: @@ -363,17 +351,16 @@ heads.append(n) return heads -def calculateupdate(ui, repo, checkout): - '''Return a tuple (targetrev, movemarkfrom) indicating the rev to - check out and where to move the active bookmark from, if needed.''' - movemarkfrom = None - if checkout is None: - activemark = repo._activebookmark - if isactivewdirparent(repo): - movemarkfrom = repo['.'].node() - elif activemark: - ui.status(_("updating to active bookmark %s\n") % activemark) 
- checkout = activemark +def calculateupdate(ui, repo): + '''Return a tuple (activemark, movemarkfrom) indicating the active bookmark + and where to move the active bookmark from, if needed.''' + checkout, movemarkfrom = None, None + activemark = repo._activebookmark + if isactivewdirparent(repo): + movemarkfrom = repo['.'].node() + elif activemark: + ui.status(_("updating to active bookmark %s\n") % activemark) + checkout = activemark return (checkout, movemarkfrom) def update(repo, parents, node): @@ -386,11 +373,11 @@ bmchanges = [] if marks[active] in parents: new = repo[node] - divs = [repo[b] for b in marks + divs = [marks.changectx(b) for b in marks if b.split('@', 1)[0] == active.split('@', 1)[0]] anc = repo.changelog.ancestors([new.rev()]) deletefrom = [b.node() for b in divs if b.rev() in anc or b == new] - if validdest(repo, repo[marks[active]], new): + if validdest(repo, marks.changectx(active), new): bmchanges.append((active, new.node())) for bm in divergent2delete(repo, deletefrom, active): @@ -659,12 +646,16 @@ writer(msg) localmarks.applychanges(repo, tr, changes) -def incoming(ui, repo, other): +def incoming(ui, repo, peer): '''Show bookmarks incoming from other to repo ''' ui.status(_("searching for changed bookmarks\n")) - remotemarks = unhexlifybookmarks(other.listkeys('bookmarks')) + with peer.commandexecutor() as e: + remotemarks = unhexlifybookmarks(e.callcommand('listkeys', { + 'namespace': 'bookmarks', + }).result()) + r = comparebookmarks(repo, remotemarks, repo._bookmarks) addsrc, adddst, advsrc, advdst, diverge, differ, invalid, same = r @@ -746,12 +737,16 @@ return 0 -def summary(repo, other): +def summary(repo, peer): '''Compare bookmarks between repo and other for "hg summary" output This returns "(# of incoming, # of outgoing)" tuple. 
''' - remotemarks = unhexlifybookmarks(other.listkeys('bookmarks')) + with peer.commandexecutor() as e: + remotemarks = unhexlifybookmarks(e.callcommand('listkeys', { + 'namespace': 'bookmarks', + }).result()) + r = comparebookmarks(repo, remotemarks, repo._bookmarks) addsrc, adddst, advsrc, advdst, diverge, differ, invalid, same = r return (len(addsrc), len(adddst)) @@ -833,7 +828,7 @@ Raises an abort error if old is not in the bookmark store. """ marks = repo._bookmarks - cur = repo.changectx('.').node() + cur = repo['.'].node() newact = None changes = [] hiddenrev = None diff -r fb92df8b634c -r ed5448edcbfa mercurial/branchmap.py --- a/mercurial/branchmap.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/branchmap.py Wed Apr 18 15:32:08 2018 -0400 @@ -18,9 +18,13 @@ from . import ( encoding, error, + pycompat, scmutil, util, ) +from .utils import ( + stringutil, +) calcsize = struct.calcsize pack_into = struct.pack_into @@ -52,18 +56,19 @@ filteredhash=filteredhash) if not partial.validfor(repo): # invalidate the cache - raise ValueError('tip differs') + raise ValueError(r'tip differs') cl = repo.changelog for l in lines: if not l: continue node, state, label = l.split(" ", 2) if state not in 'oc': - raise ValueError('invalid branch state') + raise ValueError(r'invalid branch state') label = encoding.tolocal(label.strip()) node = bin(node) if not cl.hasnode(node): - raise ValueError('node %s does not exist' % hex(node)) + raise ValueError( + r'node %s does not exist' % pycompat.sysstr(hex(node))) partial.setdefault(label, []).append(node) if state == 'c': partial._closednodes.add(node) @@ -73,7 +78,7 @@ if repo.filtername is not None: msg += ' (%s)' % repo.filtername msg += ': %s\n' - repo.ui.debug(msg % inst) + repo.ui.debug(msg % pycompat.bytestr(inst)) partial = None return partial @@ -253,7 +258,8 @@ repo.filtername, len(self), nodecount) except (IOError, OSError, error.Abort) as inst: # Abort may be raised by read only opener, so log and continue - 
repo.ui.debug("couldn't write branch cache: %s\n" % inst) + repo.ui.debug("couldn't write branch cache: %s\n" % + stringutil.forcebytestr(inst)) def update(self, repo, revgen): """Given a branchhead cache, self, that may have extra nodes or be @@ -375,7 +381,7 @@ self._rbcrevs[:] = data except (IOError, OSError) as inst: repo.ui.debug("couldn't read revision branch cache: %s\n" % - inst) + stringutil.forcebytestr(inst)) # remember number of good records on disk self._rbcrevslen = min(len(self._rbcrevs) // _rbcrecsize, len(repo.changelog)) @@ -451,6 +457,26 @@ self._setcachedata(rev, reponode, branchidx) return b, close + def setdata(self, branch, rev, node, close): + """add new data information to the cache""" + if branch in self._namesreverse: + branchidx = self._namesreverse[branch] + else: + branchidx = len(self._names) + self._names.append(branch) + self._namesreverse[branch] = branchidx + if close: + branchidx |= _rbccloseflag + self._setcachedata(rev, node, branchidx) + # If no cache data were readable (non exists, bad permission, etc) + # the cache was bypassing itself by setting: + # + # self.branchinfo = self._branchinfo + # + # Since we now have data in the cache, we need to drop this bypassing. 
+ if 'branchinfo' in vars(self): + del self.branchinfo + def _setcachedata(self, rev, node, branchidx): """Writes the node's branch data to the in-memory cache data.""" if rev == nullrev: @@ -517,7 +543,7 @@ self._rbcrevslen = revs except (IOError, OSError, error.Abort, error.LockError) as inst: repo.ui.debug("couldn't write revision branch cache%s: %s\n" - % (step, inst)) + % (step, stringutil.forcebytestr(inst))) finally: if wlock is not None: wlock.release() diff -r fb92df8b634c -r ed5448edcbfa mercurial/bundle2.py --- a/mercurial/bundle2.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/bundle2.py Wed Apr 18 15:32:08 2018 -0400 @@ -147,6 +147,7 @@ from __future__ import absolute_import, division +import collections import errno import os import re @@ -158,6 +159,7 @@ from . import ( bookmarks, changegroup, + encoding, error, node as nodemod, obsolete, @@ -169,6 +171,9 @@ url, util, ) +from .utils import ( + stringutil, +) urlerr = util.urlerr urlreq = util.urlreq @@ -294,7 +299,7 @@ * a way to construct a bundle response when applicable. 
""" - def __init__(self, repo, transactiongetter, captureoutput=True): + def __init__(self, repo, transactiongetter, captureoutput=True, source=''): self.repo = repo self.ui = repo.ui self.records = unbundlerecords() @@ -304,6 +309,7 @@ self._gettransaction = transactiongetter # carries value that can modify part behavior self.modes = {} + self.source = source def gettransaction(self): transaction = self._gettransaction() @@ -336,7 +342,7 @@ to be created""" raise TransactionUnavailable() -def applybundle(repo, unbundler, tr, source=None, url=None, **kwargs): +def applybundle(repo, unbundler, tr, source, url=None, **kwargs): # transform me into unbundler.apply() as soon as the freeze is lifted if isinstance(unbundler, unbundle20): tr.hookargs['bundle2'] = '1' @@ -344,10 +350,10 @@ tr.hookargs['source'] = source if url is not None and 'url' not in tr.hookargs: tr.hookargs['url'] = url - return processbundle(repo, unbundler, lambda: tr) + return processbundle(repo, unbundler, lambda: tr, source=source) else: # the transactiongetter won't be used, but we might as well set it - op = bundleoperation(repo, lambda: tr) + op = bundleoperation(repo, lambda: tr, source=source) _processchangegroup(op, unbundler, tr, source, url, **kwargs) return op @@ -419,7 +425,7 @@ self.repo.ui.debug('bundle2-input-bundle: %i parts total\n' % self.count) -def processbundle(repo, unbundler, transactiongetter=None, op=None): +def processbundle(repo, unbundler, transactiongetter=None, op=None, source=''): """This function process a bundle, apply effect to/from a repo It iterates over each part then searches for and uses the proper handling @@ -435,7 +441,7 @@ if op is None: if transactiongetter is None: transactiongetter = _notransaction - op = bundleoperation(repo, transactiongetter) + op = bundleoperation(repo, transactiongetter, source=source) # todo: # - replace this is a init function soon. 
# - exception catching @@ -1089,7 +1095,7 @@ ui.debug('bundle2-generatorexit\n') raise except BaseException as exc: - bexc = util.forcebytestr(exc) + bexc = stringutil.forcebytestr(exc) # backup exception data for later ui.debug('bundle2-input-stream-interrupt: encoding exception %s' % bexc) @@ -1490,6 +1496,7 @@ 'digests': tuple(sorted(util.DIGESTS.keys())), 'remote-changegroup': ('http', 'https'), 'hgtagsfnodes': (), + 'rev-branch-cache': (), 'phases': ('heads',), 'stream': ('v2',), } @@ -1574,21 +1581,30 @@ # different right now. So we keep them separated for now for the sake of # simplicity. - # we always want a changegroup in such bundle - cgversion = opts.get('cg.version') - if cgversion is None: - cgversion = changegroup.safeversion(repo) - cg = changegroup.makechangegroup(repo, outgoing, cgversion, source) - part = bundler.newpart('changegroup', data=cg.getchunks()) - part.addparam('version', cg.version) - if 'clcount' in cg.extras: - part.addparam('nbchanges', '%d' % cg.extras['clcount'], - mandatory=False) - if opts.get('phases') and repo.revs('%ln and secret()', - outgoing.missingheads): - part.addparam('targetphase', '%d' % phases.secret, mandatory=False) + # we might not always want a changegroup in such bundle, for example in + # stream bundles + if opts.get('changegroup', True): + cgversion = opts.get('cg.version') + if cgversion is None: + cgversion = changegroup.safeversion(repo) + cg = changegroup.makechangegroup(repo, outgoing, cgversion, source) + part = bundler.newpart('changegroup', data=cg.getchunks()) + part.addparam('version', cg.version) + if 'clcount' in cg.extras: + part.addparam('nbchanges', '%d' % cg.extras['clcount'], + mandatory=False) + if opts.get('phases') and repo.revs('%ln and secret()', + outgoing.missingheads): + part.addparam('targetphase', '%d' % phases.secret, mandatory=False) - addparttagsfnodescache(repo, bundler, outgoing) + if opts.get('streamv2', False): + addpartbundlestream2(bundler, repo, stream=True) + + if 
opts.get('tagsfnodescache', True): + addparttagsfnodescache(repo, bundler, outgoing) + + if opts.get('revbranchcache', True): + addpartrevbranchcache(repo, bundler, outgoing) if opts.get('obsolescence', False): obsmarkers = repo.obsstore.relevantmarkers(outgoing.missing) @@ -1623,6 +1639,59 @@ if chunks: bundler.newpart('hgtagsfnodes', data=''.join(chunks)) +def addpartrevbranchcache(repo, bundler, outgoing): + # we include the rev branch cache for the bundle changeset + # (as an optional parts) + cache = repo.revbranchcache() + cl = repo.unfiltered().changelog + branchesdata = collections.defaultdict(lambda: (set(), set())) + for node in outgoing.missing: + branch, close = cache.branchinfo(cl.rev(node)) + branchesdata[branch][close].add(node) + + def generate(): + for branch, (nodes, closed) in sorted(branchesdata.items()): + utf8branch = encoding.fromlocal(branch) + yield rbcstruct.pack(len(utf8branch), len(nodes), len(closed)) + yield utf8branch + for n in sorted(nodes): + yield n + for n in sorted(closed): + yield n + + bundler.newpart('cache:rev-branch-cache', data=generate()) + +def _formatrequirementsspec(requirements): + return urlreq.quote(','.join(sorted(requirements))) + +def _formatrequirementsparams(requirements): + requirements = _formatrequirementsspec(requirements) + params = "%s%s" % (urlreq.quote("requirements="), requirements) + return params + +def addpartbundlestream2(bundler, repo, **kwargs): + if not kwargs.get('stream', False): + return + + if not streamclone.allowservergeneration(repo): + raise error.Abort(_('stream data requested but server does not allow ' + 'this feature'), + hint=_('well-behaved clients should not be ' + 'requesting stream data from servers not ' + 'advertising it; the client may be buggy')) + + # Stream clones don't compress well. And compression undermines a + # goal of stream clones, which is to be fast. Communicate the desire + # to avoid compression to consumers of the bundle. 
+ bundler.prefercompressed = False + + filecount, bytecount, it = streamclone.generatev2(repo) + requirements = _formatrequirementsspec(repo.requirements) + part = bundler.newpart('stream2', data=it) + part.addparam('bytecount', '%d' % bytecount, mandatory=True) + part.addparam('filecount', '%d' % filecount, mandatory=True) + part.addparam('requirements', requirements, mandatory=True) + def buildobsmarkerspart(bundler, markers): """add an obsmarker part to the bundler with @@ -1729,7 +1798,7 @@ extrakwargs = {} targetphase = inpart.params.get('targetphase') if targetphase is not None: - extrakwargs['targetphase'] = int(targetphase) + extrakwargs[r'targetphase'] = int(targetphase) ret = _processchangegroup(op, cg, tr, 'bundle2', 'bundle2', expectedtotal=nbchangesets, **extrakwargs) if op.reply is not None: @@ -1946,7 +2015,8 @@ value = inpart.params.get(name) if value is not None: kwargs[name] = value - raise error.PushkeyFailed(inpart.params['in-reply-to'], **kwargs) + raise error.PushkeyFailed(inpart.params['in-reply-to'], + **pycompat.strkwargs(kwargs)) @parthandler('error:unsupportedcontent', ('parttype', 'params')) def handleerrorunsupportedcontent(op, inpart): @@ -1959,7 +2029,7 @@ if params is not None: kwargs['params'] = params.split('\0') - raise error.BundleUnknownFeatureError(**kwargs) + raise error.BundleUnknownFeatureError(**pycompat.strkwargs(kwargs)) @parthandler('error:pushraced', ('message',)) def handleerrorpushraced(op, inpart): @@ -2001,7 +2071,8 @@ for key in ('namespace', 'key', 'new', 'old', 'ret'): if key in inpart.params: kwargs[key] = inpart.params[key] - raise error.PushkeyFailed(partid=str(inpart.id), **kwargs) + raise error.PushkeyFailed(partid='%d' % inpart.id, + **pycompat.strkwargs(kwargs)) @parthandler('bookmarks') def handlebookmark(op, inpart): @@ -2040,14 +2111,15 @@ allhooks.append(hookargs) for hookargs in allhooks: - op.repo.hook('prepushkey', throw=True, **hookargs) + op.repo.hook('prepushkey', throw=True, + 
**pycompat.strkwargs(hookargs)) bookstore.applychanges(op.repo, op.gettransaction(), changes) if pushkeycompat: def runhook(): for hookargs in allhooks: - op.repo.hook('pushkey', **hookargs) + op.repo.hook('pushkey', **pycompat.strkwargs(hookargs)) op.repo._afterlock(runhook) elif bookmarksmode == 'records': @@ -2126,6 +2198,40 @@ cache.write() op.ui.debug('applied %i hgtags fnodes cache entries\n' % count) +rbcstruct = struct.Struct('>III') + +@parthandler('cache:rev-branch-cache') +def handlerbc(op, inpart): + """receive a rev-branch-cache payload and update the local cache + + The payload is a series of data related to each branch + + 1) branch name length + 2) number of open heads + 3) number of closed heads + 4) open heads nodes + 5) closed heads nodes + """ + total = 0 + rawheader = inpart.read(rbcstruct.size) + cache = op.repo.revbranchcache() + cl = op.repo.unfiltered().changelog + while rawheader: + header = rbcstruct.unpack(rawheader) + total += header[1] + header[2] + utf8branch = inpart.read(header[0]) + branch = encoding.tolocal(utf8branch) + for x in xrange(header[1]): + node = inpart.read(20) + rev = cl.rev(node) + cache.setdata(branch, rev, node, False) + for x in xrange(header[2]): + node = inpart.read(20) + rev = cl.rev(node) + cache.setdata(branch, rev, node, True) + rawheader = inpart.read(rbcstruct.size) + cache.write() + @parthandler('pushvars') def bundle2getvars(op, part): '''unbundle a bundle2 containing shellvars on the server''' diff -r fb92df8b634c -r ed5448edcbfa mercurial/bundlerepo.py --- a/mercurial/bundlerepo.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/bundlerepo.py Wed Apr 18 15:32:08 2018 -0400 @@ -217,11 +217,11 @@ self._dirlogstarts, dir=d) return super(bundlemanifest, self).dirlog(d) -class bundlefilelog(bundlerevlog, filelog.filelog): +class bundlefilelog(filelog.filelog): def __init__(self, opener, path, cgunpacker, linkmapper): filelog.filelog.__init__(self, opener, path) - bundlerevlog.__init__(self, opener, 
self.indexfile, cgunpacker, - linkmapper) + self._revlog = bundlerevlog(opener, self.indexfile, + cgunpacker, linkmapper) def baserevision(self, nodeorrev): return filelog.filelog.revision(self, nodeorrev, raw=True) @@ -349,7 +349,7 @@ suffix=suffix) self.tempfile = temp - with os.fdopen(fdtemp, pycompat.sysstr('wb')) as fptemp: + with os.fdopen(fdtemp, r'wb') as fptemp: fptemp.write(header) while True: chunk = readfn(2**18) @@ -402,7 +402,7 @@ # manifestlog implementation did not consume the manifests from the # changegroup (ex: it might be consuming trees from a separate bundle2 # part instead). So we need to manually consume it. - if 'filestart' not in self.__dict__: + if r'filestart' not in self.__dict__: self._consumemanifest() return self.filestart @@ -420,7 +420,7 @@ linkmapper = self.unfiltered().changelog.rev return bundlefilelog(self.svfs, f, self._cgunpacker, linkmapper) else: - return filelog.filelog(self.svfs, f) + return super(bundlerepository, self).file(f) def close(self): """Close assigned bundle file immediately.""" @@ -450,7 +450,7 @@ self.ui.warn(msg % nodemod.hex(p2)) return super(bundlerepository, self).setparents(p1, p2) -def instance(ui, path, create): +def instance(ui, path, create, intents=None): if create: raise error.Abort(_('cannot create new bundle repository')) # internal config: bundle.mainreporoot @@ -492,9 +492,9 @@ def release(self): raise NotImplementedError -def getremotechanges(ui, repo, other, onlyheads=None, bundlename=None, +def getremotechanges(ui, repo, peer, onlyheads=None, bundlename=None, force=False): - '''obtains a bundle of changes incoming from other + '''obtains a bundle of changes incoming from peer "onlyheads" restricts the returned changes to those reachable from the specified heads. @@ -507,13 +507,13 @@ "local" is a local repo from which to obtain the actual incoming changesets; it is a bundlerepo for the obtained bundle when the - original "other" is remote. + original "peer" is remote. 
"csets" lists the incoming changeset node ids. "cleanupfn" must be called without arguments when you're done processing - the changes; it closes both the original "other" and the one returned + the changes; it closes both the original "peer" and the one returned here. ''' - tmp = discovery.findcommonincoming(repo, other, heads=onlyheads, + tmp = discovery.findcommonincoming(repo, peer, heads=onlyheads, force=force) common, incoming, rheads = tmp if not incoming: @@ -522,41 +522,62 @@ os.unlink(bundlename) except OSError: pass - return repo, [], other.close + return repo, [], peer.close commonset = set(common) rheads = [x for x in rheads if x not in commonset] bundle = None bundlerepo = None - localrepo = other.local() + localrepo = peer.local() if bundlename or not localrepo: - # create a bundle (uncompressed if other repo is not local) + # create a bundle (uncompressed if peer repo is not local) # developer config: devel.legacy.exchange legexc = ui.configlist('devel', 'legacy.exchange') forcebundle1 = 'bundle2' not in legexc and 'bundle1' in legexc canbundle2 = (not forcebundle1 - and other.capable('getbundle') - and other.capable('bundle2')) + and peer.capable('getbundle') + and peer.capable('bundle2')) if canbundle2: - kwargs = {} - kwargs[r'common'] = common - kwargs[r'heads'] = rheads - kwargs[r'bundlecaps'] = exchange.caps20to10(repo, role='client') - kwargs[r'cg'] = True - b2 = other.getbundle('incoming', **kwargs) - fname = bundle = changegroup.writechunks(ui, b2._forwardchunks(), - bundlename) + with peer.commandexecutor() as e: + b2 = e.callcommand('getbundle', { + 'source': 'incoming', + 'common': common, + 'heads': rheads, + 'bundlecaps': exchange.caps20to10(repo, role='client'), + 'cg': True, + }).result() + + fname = bundle = changegroup.writechunks(ui, + b2._forwardchunks(), + bundlename) else: - if other.capable('getbundle'): - cg = other.getbundle('incoming', common=common, heads=rheads) - elif onlyheads is None and not 
other.capable('changegroupsubset'): + if peer.capable('getbundle'): + with peer.commandexecutor() as e: + cg = e.callcommand('getbundle', { + 'source': 'incoming', + 'common': common, + 'heads': rheads, + }).result() + elif onlyheads is None and not peer.capable('changegroupsubset'): # compat with older servers when pulling all remote heads - cg = other.changegroup(incoming, "incoming") + + with peer.commandexecutor() as e: + cg = e.callcommand('changegroup', { + 'nodes': incoming, + 'source': 'incoming', + }).result() + rheads = None else: - cg = other.changegroupsubset(incoming, rheads, 'incoming') + with peer.commandexecutor() as e: + cg = e.callcommand('changegroupsubset', { + 'bases': incoming, + 'heads': rheads, + 'source': 'incoming', + }).result() + if localrepo: bundletype = "HG10BZ" else: @@ -570,7 +591,7 @@ # use the created uncompressed bundlerepo localrepo = bundlerepo = bundlerepository(repo.baseui, repo.root, fname) - # this repo contains local and other now, so filter out local again + # this repo contains local and peer now, so filter out local again common = repo.heads() if localrepo: # Part of common may be remotely filtered @@ -582,9 +603,13 @@ if bundlerepo: reponodes = [ctx.node() for ctx in bundlerepo[bundlerepo.firstnewrev:]] - remotephases = other.listkeys('phases') - pullop = exchange.pulloperation(bundlerepo, other, heads=reponodes) + with peer.commandexecutor() as e: + remotephases = e.callcommand('listkeys', { + 'namespace': 'phases', + }).result() + + pullop = exchange.pulloperation(bundlerepo, peer, heads=reponodes) pullop.trmanager = bundletransactionmanager() exchange._pullapplyphases(pullop, remotephases) @@ -593,6 +618,6 @@ bundlerepo.close() if bundle: os.unlink(bundle) - other.close() + peer.close() return (localrepo, csets, cleanup) diff -r fb92df8b634c -r ed5448edcbfa mercurial/byterange.py --- a/mercurial/byterange.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,472 +0,0 @@ -# This library 
is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see -# . - -# This file is part of urlgrabber, a high-level cross-protocol url-grabber -# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko - -# $Id: byterange.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $ - -from __future__ import absolute_import - -import email -import ftplib -import mimetypes -import os -import re -import socket -import stat - -from . import ( - urllibcompat, - util, -) - -urlerr = util.urlerr -urlreq = util.urlreq - -addclosehook = urlreq.addclosehook -addinfourl = urlreq.addinfourl -splitattr = urlreq.splitattr -splitpasswd = urlreq.splitpasswd -splitport = urlreq.splitport -splituser = urlreq.splituser -unquote = urlreq.unquote - -class RangeError(IOError): - """Error raised when an unsatisfiable range is requested.""" - -class HTTPRangeHandler(urlreq.basehandler): - """Handler that enables HTTP Range headers. - - This was extremely simple. The Range header is a HTTP feature to - begin with so all this class does is tell urllib2 that the - "206 Partial Content" response from the HTTP server is what we - expected. 
- - Example: - import urllib2 - import byterange - - range_handler = range.HTTPRangeHandler() - opener = urlreq.buildopener(range_handler) - - # install it - urlreq.installopener(opener) - - # create Request and set Range header - req = urlreq.request('http://www.python.org/') - req.header['Range'] = 'bytes=30-50' - f = urlreq.urlopen(req) - """ - - def http_error_206(self, req, fp, code, msg, hdrs): - # 206 Partial Content Response - r = urlreq.addinfourl(fp, hdrs, req.get_full_url()) - r.code = code - r.msg = msg - return r - - def http_error_416(self, req, fp, code, msg, hdrs): - # HTTP's Range Not Satisfiable error - raise RangeError('Requested Range Not Satisfiable') - -class RangeableFileObject(object): - """File object wrapper to enable raw range handling. - This was implemented primarily for handling range - specifications for file:// urls. This object effectively makes - a file object look like it consists only of a range of bytes in - the stream. - - Examples: - # expose 10 bytes, starting at byte position 20, from - # /etc/aliases. - >>> fo = RangeableFileObject(file(b'/etc/passwd', b'r'), (20,30)) - # seek seeks within the range (to position 23 in this case) - >>> fo.seek(3) - # tell tells where your at _within the range_ (position 3 in - # this case) - >>> fo.tell() - # read EOFs if an attempt is made to read past the last - # byte in the range. the following will return only 7 bytes. - >>> fo.read(30) - """ - - def __init__(self, fo, rangetup): - """Create a RangeableFileObject. - fo -- a file like object. only the read() method need be - supported but supporting an optimized seek() is - preferable. - rangetup -- a (firstbyte,lastbyte) tuple specifying the range - to work over. - The file object provided is assumed to be at byte offset 0. 
- """ - self.fo = fo - (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) - self.realpos = 0 - self._do_seek(self.firstbyte) - - def __getattr__(self, name): - """This effectively allows us to wrap at the instance level. - Any attribute not found in _this_ object will be searched for - in self.fo. This includes methods.""" - return getattr(self.fo, name) - - def tell(self): - """Return the position within the range. - This is different from fo.seek in that position 0 is the - first byte position of the range tuple. For example, if - this object was created with a range tuple of (500,899), - tell() will return 0 when at byte position 500 of the file. - """ - return (self.realpos - self.firstbyte) - - def seek(self, offset, whence=0): - """Seek within the byte range. - Positioning is identical to that described under tell(). - """ - assert whence in (0, 1, 2) - if whence == 0: # absolute seek - realoffset = self.firstbyte + offset - elif whence == 1: # relative seek - realoffset = self.realpos + offset - elif whence == 2: # absolute from end of file - # XXX: are we raising the right Error here? - raise IOError('seek from end of file not supported.') - - # do not allow seek past lastbyte in range - if self.lastbyte and (realoffset >= self.lastbyte): - realoffset = self.lastbyte - - self._do_seek(realoffset - self.realpos) - - def read(self, size=-1): - """Read within the range. - This method will limit the size read based on the range. - """ - size = self._calc_read_size(size) - rslt = self.fo.read(size) - self.realpos += len(rslt) - return rslt - - def readline(self, size=-1): - """Read lines within the range. - This method will limit the size read based on the range. - """ - size = self._calc_read_size(size) - rslt = self.fo.readline(size) - self.realpos += len(rslt) - return rslt - - def _calc_read_size(self, size): - """Handles calculating the amount of data to read based on - the range. 
- """ - if self.lastbyte: - if size > -1: - if ((self.realpos + size) >= self.lastbyte): - size = (self.lastbyte - self.realpos) - else: - size = (self.lastbyte - self.realpos) - return size - - def _do_seek(self, offset): - """Seek based on whether wrapped object supports seek(). - offset is relative to the current position (self.realpos). - """ - assert offset >= 0 - seek = getattr(self.fo, 'seek', self._poor_mans_seek) - seek(self.realpos + offset) - self.realpos += offset - - def _poor_mans_seek(self, offset): - """Seek by calling the wrapped file objects read() method. - This is used for file like objects that do not have native - seek support. The wrapped objects read() method is called - to manually seek to the desired position. - offset -- read this number of bytes from the wrapped - file object. - raise RangeError if we encounter EOF before reaching the - specified offset. - """ - pos = 0 - bufsize = 1024 - while pos < offset: - if (pos + bufsize) > offset: - bufsize = offset - pos - buf = self.fo.read(bufsize) - if len(buf) != bufsize: - raise RangeError('Requested Range Not Satisfiable') - pos += bufsize - -class FileRangeHandler(urlreq.filehandler): - """FileHandler subclass that adds Range support. - This class handles Range headers exactly like an HTTP - server would. 
- """ - def open_local_file(self, req): - host = urllibcompat.gethost(req) - file = urllibcompat.getselector(req) - localfile = urlreq.url2pathname(file) - stats = os.stat(localfile) - size = stats[stat.ST_SIZE] - modified = email.Utils.formatdate(stats[stat.ST_MTIME]) - mtype = mimetypes.guess_type(file)[0] - if host: - host, port = urlreq.splitport(host) - if port or socket.gethostbyname(host) not in self.get_names(): - raise urlerr.urlerror('file not on local host') - fo = open(localfile,'rb') - brange = req.headers.get('Range', None) - brange = range_header_to_tuple(brange) - assert brange != () - if brange: - (fb, lb) = brange - if lb == '': - lb = size - if fb < 0 or fb > size or lb > size: - raise RangeError('Requested Range Not Satisfiable') - size = (lb - fb) - fo = RangeableFileObject(fo, (fb, lb)) - headers = email.message_from_string( - 'Content-Type: %s\nContent-Length: %d\nLast-Modified: %s\n' % - (mtype or 'text/plain', size, modified)) - return urlreq.addinfourl(fo, headers, 'file:'+file) - - -# FTP Range Support -# Unfortunately, a large amount of base FTP code had to be copied -# from urllib and urllib2 in order to insert the FTP REST command. 
-# Code modifications for range support have been commented as -# follows: -# -- range support modifications start/end here - -class FTPRangeHandler(urlreq.ftphandler): - def ftp_open(self, req): - host = urllibcompat.gethost(req) - if not host: - raise IOError('ftp error', 'no host given') - host, port = splitport(host) - if port is None: - port = ftplib.FTP_PORT - else: - port = int(port) - - # username/password handling - user, host = splituser(host) - if user: - user, passwd = splitpasswd(user) - else: - passwd = None - host = unquote(host) - user = unquote(user or '') - passwd = unquote(passwd or '') - - try: - host = socket.gethostbyname(host) - except socket.error as msg: - raise urlerr.urlerror(msg) - path, attrs = splitattr(req.get_selector()) - dirs = path.split('/') - dirs = map(unquote, dirs) - dirs, file = dirs[:-1], dirs[-1] - if dirs and not dirs[0]: - dirs = dirs[1:] - try: - fw = self.connect_ftp(user, passwd, host, port, dirs) - if file: - type = 'I' - else: - type = 'D' - - for attr in attrs: - attr, value = splitattr(attr) - if attr.lower() == 'type' and \ - value in ('a', 'A', 'i', 'I', 'd', 'D'): - type = value.upper() - - # -- range support modifications start here - rest = None - range_tup = range_header_to_tuple(req.headers.get('Range', None)) - assert range_tup != () - if range_tup: - (fb, lb) = range_tup - if fb > 0: - rest = fb - # -- range support modifications end here - - fp, retrlen = fw.retrfile(file, type, rest) - - # -- range support modifications start here - if range_tup: - (fb, lb) = range_tup - if lb == '': - if retrlen is None or retrlen == 0: - raise RangeError('Requested Range Not Satisfiable due' - ' to unobtainable file length.') - lb = retrlen - retrlen = lb - fb - if retrlen < 0: - # beginning of range is larger than file - raise RangeError('Requested Range Not Satisfiable') - else: - retrlen = lb - fb - fp = RangeableFileObject(fp, (0, retrlen)) - # -- range support modifications end here - - headers = "" - mtype = 
mimetypes.guess_type(req.get_full_url())[0] - if mtype: - headers += "Content-Type: %s\n" % mtype - if retrlen is not None and retrlen >= 0: - headers += "Content-Length: %d\n" % retrlen - headers = email.message_from_string(headers) - return addinfourl(fp, headers, req.get_full_url()) - except ftplib.all_errors as msg: - raise IOError('ftp error', msg) - - def connect_ftp(self, user, passwd, host, port, dirs): - fw = ftpwrapper(user, passwd, host, port, dirs) - return fw - -class ftpwrapper(urlreq.ftpwrapper): - # range support note: - # this ftpwrapper code is copied directly from - # urllib. The only enhancement is to add the rest - # argument and pass it on to ftp.ntransfercmd - def retrfile(self, file, type, rest=None): - self.endtransfer() - if type in ('d', 'D'): - cmd = 'TYPE A' - isdir = 1 - else: - cmd = 'TYPE ' + type - isdir = 0 - try: - self.ftp.voidcmd(cmd) - except ftplib.all_errors: - self.init() - self.ftp.voidcmd(cmd) - conn = None - if file and not isdir: - # Use nlst to see if the file exists at all - try: - self.ftp.nlst(file) - except ftplib.error_perm as reason: - raise IOError('ftp error', reason) - # Restore the transfer mode! - self.ftp.voidcmd(cmd) - # Try to retrieve as a file - try: - cmd = 'RETR ' + file - conn = self.ftp.ntransfercmd(cmd, rest) - except ftplib.error_perm as reason: - if str(reason).startswith('501'): - # workaround for REST not supported error - fp, retrlen = self.retrfile(file, type) - fp = RangeableFileObject(fp, (rest,'')) - return (fp, retrlen) - elif not str(reason).startswith('550'): - raise IOError('ftp error', reason) - if not conn: - # Set transfer mode to ASCII! 
- self.ftp.voidcmd('TYPE A') - # Try a directory listing - if file: - cmd = 'LIST ' + file - else: - cmd = 'LIST' - conn = self.ftp.ntransfercmd(cmd) - self.busy = 1 - # Pass back both a suitably decorated object and a retrieval length - return (addclosehook(conn[0].makefile('rb'), - self.endtransfer), conn[1]) - - -#################################################################### -# Range Tuple Functions -# XXX: These range tuple functions might go better in a class. - -_rangere = None -def range_header_to_tuple(range_header): - """Get a (firstbyte,lastbyte) tuple from a Range header value. - - Range headers have the form "bytes=-". This - function pulls the firstbyte and lastbyte values and returns - a (firstbyte,lastbyte) tuple. If lastbyte is not specified in - the header value, it is returned as an empty string in the - tuple. - - Return None if range_header is None - Return () if range_header does not conform to the range spec - pattern. - - """ - global _rangere - if range_header is None: - return None - if _rangere is None: - _rangere = re.compile(br'^bytes=(\d{1,})-(\d*)') - match = _rangere.match(range_header) - if match: - tup = range_tuple_normalize(match.group(1, 2)) - if tup and tup[1]: - tup = (tup[0], tup[1]+1) - return tup - return () - -def range_tuple_to_header(range_tup): - """Convert a range tuple to a Range header value. - Return a string of the form "bytes=-" or None - if no range is needed. - """ - if range_tup is None: - return None - range_tup = range_tuple_normalize(range_tup) - if range_tup: - if range_tup[1]: - range_tup = (range_tup[0], range_tup[1] - 1) - return 'bytes=%s-%s' % range_tup - -def range_tuple_normalize(range_tup): - """Normalize a (first_byte,last_byte) range tuple. - Return a tuple whose first element is guaranteed to be an int - and whose second element will be '' (meaning: the last byte) or - an int. Finally, return None if the normalized tuple == (0,'') - as that is equivalent to retrieving the entire file. 
- """ - if range_tup is None: - return None - # handle first byte - fb = range_tup[0] - if fb in (None, ''): - fb = 0 - else: - fb = int(fb) - # handle last byte - try: - lb = range_tup[1] - except IndexError: - lb = '' - else: - if lb is None: - lb = '' - elif lb != '': - lb = int(lb) - # check if range is over the entire file - if (fb, lb) == (0, ''): - return None - # check that the range is valid - if lb < fb: - raise RangeError('Invalid byte range: %s-%s' % (fb, lb)) - return (fb, lb) diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/base85.c --- a/mercurial/cext/base85.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/base85.c Wed Apr 18 15:32:08 2018 -0400 @@ -14,8 +14,9 @@ #include "util.h" -static const char b85chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; +static const char b85chars[] = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; static char b85dec[256]; static void b85prep(void) @@ -36,7 +37,7 @@ unsigned int acc, val, ch; int pad = 0; - if (!PyArg_ParseTuple(args, "s#|i", &text, &len, &pad)) + if (!PyArg_ParseTuple(args, PY23("s#|i", "y#|i"), &text, &len, &pad)) return NULL; if (pad) @@ -83,7 +84,7 @@ int c; unsigned int acc; - if (!PyArg_ParseTuple(args, "s#", &text, &len)) + if (!PyArg_ParseTuple(args, PY23("s#", "y#"), &text, &len)) return NULL; olen = len / 5 * 4; @@ -105,25 +106,25 @@ c = b85dec[(int)*text++] - 1; if (c < 0) return PyErr_Format( - PyExc_ValueError, - "bad base85 character at position %d", - (int)i); + PyExc_ValueError, + "bad base85 character at position %d", + (int)i); acc = acc * 85 + c; } if (i++ < len) { c = b85dec[(int)*text++] - 1; if (c < 0) return PyErr_Format( - PyExc_ValueError, - "bad base85 character at position %d", - (int)i); + PyExc_ValueError, + "bad base85 character at position %d", + (int)i); /* overflow detection: 0xffffffff == "|NsC0", * "|NsC" == 0x03030303 */ if (acc > 0x03030303 || (acc *= 85) > 
0xffffffff - c) return PyErr_Format( - PyExc_ValueError, - "bad base85 sequence at position %d", - (int)i); + PyExc_ValueError, + "bad base85 sequence at position %d", + (int)i); acc += c; } @@ -145,23 +146,19 @@ static char base85_doc[] = "Base85 Data Encoding"; static PyMethodDef methods[] = { - {"b85encode", b85encode, METH_VARARGS, - "Encode text in base85.\n\n" - "If the second parameter is true, pad the result to a multiple of " - "five characters.\n"}, - {"b85decode", b85decode, METH_VARARGS, "Decode base85 text.\n"}, - {NULL, NULL} + {"b85encode", b85encode, METH_VARARGS, + "Encode text in base85.\n\n" + "If the second parameter is true, pad the result to a multiple of " + "five characters.\n"}, + {"b85decode", b85decode, METH_VARARGS, "Decode base85 text.\n"}, + {NULL, NULL}, }; static const int version = 1; #ifdef IS_PY3K static struct PyModuleDef base85_module = { - PyModuleDef_HEAD_INIT, - "base85", - base85_doc, - -1, - methods + PyModuleDef_HEAD_INIT, "base85", base85_doc, -1, methods, }; PyMODINIT_FUNC PyInit_base85(void) diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/bdiff.c --- a/mercurial/cext/bdiff.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/bdiff.c Wed Apr 18 15:32:08 2018 -0400 @@ -17,9 +17,9 @@ #include "bdiff.h" #include "bitmanipulation.h" +#include "thirdparty/xdiff/xdiff.h" #include "util.h" - static PyObject *blocks(PyObject *self, PyObject *args) { PyObject *sa, *sb, *rl = NULL, *m; @@ -61,42 +61,60 @@ static PyObject *bdiff(PyObject *self, PyObject *args) { - char *sa, *sb, *rb, *ia, *ib; + Py_buffer ba, bb; + char *rb, *ia, *ib; PyObject *result = NULL; - struct bdiff_line *al, *bl; + struct bdiff_line *al = NULL, *bl = NULL; struct bdiff_hunk l, *h; int an, bn, count; Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax; - PyThreadState *_save; + PyThreadState *_save = NULL; l.next = NULL; - if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb)) + if (!PyArg_ParseTuple(args, PY23("s*s*:bdiff", "y*y*:bdiff"), 
&ba, &bb)) return NULL; + if (!PyBuffer_IsContiguous(&ba, 'C') || ba.ndim > 1) { + PyErr_SetString(PyExc_ValueError, "bdiff input not contiguous"); + goto cleanup; + } + + if (!PyBuffer_IsContiguous(&bb, 'C') || bb.ndim > 1) { + PyErr_SetString(PyExc_ValueError, "bdiff input not contiguous"); + goto cleanup; + } + + la = ba.len; + lb = bb.len; + if (la > UINT_MAX || lb > UINT_MAX) { PyErr_SetString(PyExc_ValueError, "bdiff inputs too large"); - return NULL; + goto cleanup; } _save = PyEval_SaveThread(); lmax = la > lb ? lb : la; - for (ia = sa, ib = sb; - li < lmax && *ia == *ib; - ++li, ++ia, ++ib) + for (ia = ba.buf, ib = bb.buf; li < lmax && *ia == *ib; + ++li, ++ia, ++ib) { if (*ia == '\n') lcommon = li + 1; + } /* we can almost add: if (li == lmax) lcommon = li; */ - an = bdiff_splitlines(sa + lcommon, la - lcommon, &al); - bn = bdiff_splitlines(sb + lcommon, lb - lcommon, &bl); - if (!al || !bl) - goto nomem; + an = bdiff_splitlines((char *)ba.buf + lcommon, la - lcommon, &al); + bn = bdiff_splitlines((char *)bb.buf + lcommon, lb - lcommon, &bl); + if (!al || !bl) { + PyErr_NoMemory(); + goto cleanup; + } count = bdiff_diff(al, an, bl, bn, &l); - if (count < 0) - goto nomem; + if (count < 0) { + PyErr_NoMemory(); + goto cleanup; + } /* calculate length of output */ la = lb = 0; @@ -112,7 +130,7 @@ result = PyBytes_FromStringAndSize(NULL, len); if (!result) - goto nomem; + goto cleanup; /* build binary patch */ rb = PyBytes_AsString(result); @@ -122,7 +140,8 @@ if (h->a1 != la || h->b1 != lb) { len = bl[h->b1].l - bl[lb].l; putbe32((uint32_t)(al[la].l + lcommon - al->l), rb); - putbe32((uint32_t)(al[h->a1].l + lcommon - al->l), rb + 4); + putbe32((uint32_t)(al[h->a1].l + lcommon - al->l), + rb + 4); putbe32((uint32_t)len, rb + 8); memcpy(rb + 12, bl[lb].l, len); rb += 12 + len; @@ -131,13 +150,21 @@ lb = h->b2; } -nomem: +cleanup: if (_save) PyEval_RestoreThread(_save); - free(al); - free(bl); - bdiff_freehunks(l.next); - return result ? 
result : PyErr_NoMemory(); + PyBuffer_Release(&ba); + PyBuffer_Release(&bb); + if (al) { + free(al); + } + if (bl) { + free(bl); + } + if (l.next) { + bdiff_freehunks(l.next); + } + return result; } /* @@ -167,8 +194,8 @@ if (c == ' ' || c == '\t' || c == '\r') { if (!allws && (wlen == 0 || w[wlen - 1] != ' ')) w[wlen++] = ' '; - } else if (c == '\n' && !allws - && wlen > 0 && w[wlen - 1] == ' ') { + } else if (c == '\n' && !allws && wlen > 0 && + w[wlen - 1] == ' ') { w[wlen - 1] = '\n'; } else { w[wlen++] = c; @@ -182,25 +209,124 @@ return result ? result : PyErr_NoMemory(); } +static bool sliceintolist(PyObject *list, Py_ssize_t destidx, + const char *source, Py_ssize_t len) +{ + PyObject *sliced = PyBytes_FromStringAndSize(source, len); + if (sliced == NULL) + return false; + PyList_SET_ITEM(list, destidx, sliced); + return true; +} + +static PyObject *splitnewlines(PyObject *self, PyObject *args) +{ + const char *text; + Py_ssize_t nelts = 0, size, i, start = 0; + PyObject *result = NULL; + + if (!PyArg_ParseTuple(args, PY23("s#", "y#"), &text, &size)) { + goto abort; + } + if (!size) { + return PyList_New(0); + } + /* This loops to size-1 because if the last byte is a newline, + * we don't want to perform a split there. 
*/ + for (i = 0; i < size - 1; ++i) { + if (text[i] == '\n') { + ++nelts; + } + } + if ((result = PyList_New(nelts + 1)) == NULL) + goto abort; + nelts = 0; + for (i = 0; i < size - 1; ++i) { + if (text[i] == '\n') { + if (!sliceintolist(result, nelts++, text + start, + i - start + 1)) + goto abort; + start = i + 1; + } + } + if (!sliceintolist(result, nelts++, text + start, size - start)) + goto abort; + return result; +abort: + Py_XDECREF(result); + return NULL; +} + +static int hunk_consumer(int64_t a1, int64_t a2, int64_t b1, int64_t b2, + void *priv) +{ + PyObject *rl = (PyObject *)priv; + PyObject *m = Py_BuildValue("llll", a1, a2, b1, b2); + if (!m) + return -1; + if (PyList_Append(rl, m) != 0) { + Py_DECREF(m); + return -1; + } + return 0; +} + +static PyObject *xdiffblocks(PyObject *self, PyObject *args) +{ + Py_ssize_t la, lb; + mmfile_t a, b; + PyObject *rl; + + xpparam_t xpp = { + XDF_INDENT_HEURISTIC, /* flags */ + }; + xdemitconf_t xecfg = { + XDL_EMIT_BDIFFHUNK, /* flags */ + hunk_consumer, /* hunk_consume_func */ + }; + xdemitcb_t ecb = { + NULL, /* priv */ + }; + + if (!PyArg_ParseTuple(args, PY23("s#s#", "y#y#"), &a.ptr, &la, &b.ptr, + &lb)) + return NULL; + + a.size = la; + b.size = lb; + + rl = PyList_New(0); + if (!rl) + return PyErr_NoMemory(); + + ecb.priv = rl; + + if (xdl_diff(&a, &b, &xpp, &xecfg, &ecb) != 0) { + Py_DECREF(rl); + return PyErr_NoMemory(); + } + + return rl; +} static char mdiff_doc[] = "Efficient binary diff."; static PyMethodDef methods[] = { - {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, - {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"}, - {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"}, - {NULL, NULL} + {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, + {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"}, + {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"}, + {"splitnewlines", splitnewlines, METH_VARARGS, + "like str.splitlines, 
but only split on newlines\n"}, + {"xdiffblocks", xdiffblocks, METH_VARARGS, + "find a list of matching lines using xdiff algorithm\n"}, + {NULL, NULL}, }; -static const int version = 1; +static const int version = 3; #ifdef IS_PY3K static struct PyModuleDef bdiff_module = { - PyModuleDef_HEAD_INIT, - "bdiff", - mdiff_doc, - -1, - methods + PyModuleDef_HEAD_INIT, "bdiff", mdiff_doc, -1, methods, }; PyMODINIT_FUNC PyInit_bdiff(void) diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/charencode.c --- a/mercurial/cext/charencode.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/charencode.c Wed Apr 18 15:32:08 2018 -0400 @@ -65,7 +65,6 @@ '\x58', '\x59', '\x5a', /* x-z */ '\x7b', '\x7c', '\x7d', '\x7e', '\x7f' }; -/* clang-format on */ /* 1: no escape, 2: \, 6: \u */ static const uint8_t jsonlentable[256] = { @@ -102,6 +101,7 @@ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', }; +/* clang-format on */ /* * Turn a hex-encoded string into binary. @@ -132,7 +132,8 @@ { const char *buf; Py_ssize_t i, len; - if (!PyArg_ParseTuple(args, "s#:isasciistr", &buf, &len)) + if (!PyArg_ParseTuple(args, PY23("s#:isasciistr", "y#:isasciistr"), + &buf, &len)) return NULL; i = 0; /* char array in PyStringObject should be at least 4-byte aligned */ @@ -151,9 +152,8 @@ Py_RETURN_TRUE; } -static inline PyObject *_asciitransform(PyObject *str_obj, - const char table[128], - PyObject *fallback_fn) +static inline PyObject * +_asciitransform(PyObject *str_obj, const char table[128], PyObject *fallback_fn) { char *str, *newstr; Py_ssize_t i, len; @@ -173,12 +173,12 @@ char c = str[i]; if (c & 0x80) { if (fallback_fn != NULL) { - ret = PyObject_CallFunctionObjArgs(fallback_fn, - str_obj, NULL); + ret = PyObject_CallFunctionObjArgs( + fallback_fn, str_obj, NULL); } else { PyObject *err = PyUnicodeDecodeError_Create( - "ascii", str, len, i, (i + 1), - "unexpected code byte"); + "ascii", str, len, i, (i + 1), + "unexpected code byte"); 
PyErr_SetObject(PyExc_UnicodeDecodeError, err); Py_XDECREF(err); } @@ -220,10 +220,9 @@ Py_ssize_t pos = 0; const char *table; - if (!PyArg_ParseTuple(args, "O!O!O!:make_file_foldmap", - &PyDict_Type, &dmap, - &PyInt_Type, &spec_obj, - &PyFunction_Type, &normcase_fallback)) + if (!PyArg_ParseTuple(args, "O!O!O!:make_file_foldmap", &PyDict_Type, + &dmap, &PyInt_Type, &spec_obj, &PyFunction_Type, + &normcase_fallback)) goto quit; spec = (int)PyInt_AS_LONG(spec_obj); @@ -251,7 +250,7 @@ while (PyDict_Next(dmap, &pos, &k, &v)) { if (!dirstate_tuple_check(v)) { PyErr_SetString(PyExc_TypeError, - "expected a dirstate tuple"); + "expected a dirstate tuple"); goto quit; } @@ -260,10 +259,10 @@ PyObject *normed; if (table != NULL) { normed = _asciitransform(k, table, - normcase_fallback); + normcase_fallback); } else { normed = PyObject_CallFunctionObjArgs( - normcase_fallback, k, NULL); + normcase_fallback, k, NULL); } if (normed == NULL) @@ -292,13 +291,13 @@ char c = buf[i]; if (c & 0x80) { PyErr_SetString(PyExc_ValueError, - "cannot process non-ascii str"); + "cannot process non-ascii str"); return -1; } esclen += jsonparanoidlentable[(unsigned char)c]; if (esclen < 0) { PyErr_SetString(PyExc_MemoryError, - "overflow in jsonescapelen"); + "overflow in jsonescapelen"); return -1; } } @@ -308,7 +307,7 @@ esclen += jsonlentable[(unsigned char)c]; if (esclen < 0) { PyErr_SetString(PyExc_MemoryError, - "overflow in jsonescapelen"); + "overflow in jsonescapelen"); return -1; } } @@ -336,17 +335,17 @@ case '\\': return '\\'; } - return '\0'; /* should not happen */ + return '\0'; /* should not happen */ } /* convert 'origbuf' to JSON-escaped form 'escbuf'; 'origbuf' should only include characters mappable by json(paranoid)lentable */ static void encodejsonescape(char *escbuf, Py_ssize_t esclen, - const char *origbuf, Py_ssize_t origlen, - bool paranoid) + const char *origbuf, Py_ssize_t origlen, + bool paranoid) { const uint8_t *lentable = - (paranoid) ? 
jsonparanoidlentable : jsonlentable; + (paranoid) ? jsonparanoidlentable : jsonlentable; Py_ssize_t i, j; for (i = 0, j = 0; i < origlen; i++) { @@ -377,15 +376,15 @@ const char *origbuf; Py_ssize_t origlen, esclen; int paranoid; - if (!PyArg_ParseTuple(args, "O!i:jsonescapeu8fast", - &PyBytes_Type, &origstr, ¶noid)) + if (!PyArg_ParseTuple(args, "O!i:jsonescapeu8fast", &PyBytes_Type, + &origstr, ¶noid)) return NULL; origbuf = PyBytes_AS_STRING(origstr); origlen = PyBytes_GET_SIZE(origstr); esclen = jsonescapelen(origbuf, origlen, paranoid); if (esclen < 0) - return NULL; /* unsupported char found or overflow */ + return NULL; /* unsupported char found or overflow */ if (origlen == esclen) { Py_INCREF(origstr); return origstr; @@ -395,7 +394,7 @@ if (!escstr) return NULL; encodejsonescape(PyBytes_AS_STRING(escstr), esclen, origbuf, origlen, - paranoid); + paranoid); return escstr; } diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/charencode.h --- a/mercurial/cext/charencode.h Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/charencode.h Wed Apr 18 15:32:08 2018 -0400 @@ -25,6 +25,7 @@ PyObject *make_file_foldmap(PyObject *self, PyObject *args); PyObject *jsonescapeu8fast(PyObject *self, PyObject *args); +/* clang-format off */ static const int8_t hextable[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -43,6 +44,7 @@ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; +/* clang-format on */ static inline int hexdigit(const char *p, Py_ssize_t off) { diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/diffhelpers.c --- a/mercurial/cext/diffhelpers.c Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,204 +0,0 @@ -/* - * diffhelpers.c - helper routines for mpatch - * - * Copyright 2007 Chris Mason - * - * This software may be used and distributed according to the 
terms - * of the GNU General Public License v2, incorporated herein by reference. - */ - -#include -#include -#include - -#include "util.h" - -static char diffhelpers_doc[] = "Efficient diff parsing"; -static PyObject *diffhelpers_Error; - - -/* fixup the last lines of a and b when the patch has no newline at eof */ -static void _fix_newline(PyObject *hunk, PyObject *a, PyObject *b) -{ - Py_ssize_t hunksz = PyList_Size(hunk); - PyObject *s = PyList_GET_ITEM(hunk, hunksz-1); - char *l = PyBytes_AsString(s); - Py_ssize_t alen = PyList_Size(a); - Py_ssize_t blen = PyList_Size(b); - char c = l[0]; - PyObject *hline; - Py_ssize_t sz = PyBytes_GET_SIZE(s); - - if (sz > 1 && l[sz-2] == '\r') - /* tolerate CRLF in last line */ - sz -= 1; - - hline = PyBytes_FromStringAndSize(l, sz-1); - if (!hline) { - return; - } - - if (c == ' ' || c == '+') { - PyObject *rline = PyBytes_FromStringAndSize(l + 1, sz - 2); - PyList_SetItem(b, blen-1, rline); - } - if (c == ' ' || c == '-') { - Py_INCREF(hline); - PyList_SetItem(a, alen-1, hline); - } - PyList_SetItem(hunk, hunksz-1, hline); -} - -/* python callable form of _fix_newline */ -static PyObject * -fix_newline(PyObject *self, PyObject *args) -{ - PyObject *hunk, *a, *b; - if (!PyArg_ParseTuple(args, "OOO", &hunk, &a, &b)) - return NULL; - _fix_newline(hunk, a, b); - return Py_BuildValue("l", 0); -} - -#if (PY_VERSION_HEX < 0x02050000) -static const char *addlines_format = "OOiiOO"; -#else -static const char *addlines_format = "OOnnOO"; -#endif - -/* - * read lines from fp into the hunk. The hunk is parsed into two arrays - * a and b. 
a gets the old state of the text, b gets the new state - * The control char from the hunk is saved when inserting into a, but not b - * (for performance while deleting files) - */ -static PyObject * -addlines(PyObject *self, PyObject *args) -{ - - PyObject *fp, *hunk, *a, *b, *x; - Py_ssize_t i; - Py_ssize_t lena, lenb; - Py_ssize_t num; - Py_ssize_t todoa, todob; - char *s, c; - PyObject *l; - if (!PyArg_ParseTuple(args, addlines_format, - &fp, &hunk, &lena, &lenb, &a, &b)) - return NULL; - - while (1) { - todoa = lena - PyList_Size(a); - todob = lenb - PyList_Size(b); - num = todoa > todob ? todoa : todob; - if (num == 0) - break; - for (i = 0; i < num; i++) { - x = PyFile_GetLine(fp, 0); - s = PyBytes_AsString(x); - c = *s; - if (strcmp(s, "\\ No newline at end of file\n") == 0) { - _fix_newline(hunk, a, b); - continue; - } - if (c == '\n') { - /* Some patches may be missing the control char - * on empty lines. Supply a leading space. */ - Py_DECREF(x); - x = PyBytes_FromString(" \n"); - } - PyList_Append(hunk, x); - if (c == '+') { - l = PyBytes_FromString(s + 1); - PyList_Append(b, l); - Py_DECREF(l); - } else if (c == '-') { - PyList_Append(a, x); - } else { - l = PyBytes_FromString(s + 1); - PyList_Append(b, l); - Py_DECREF(l); - PyList_Append(a, x); - } - Py_DECREF(x); - } - } - return Py_BuildValue("l", 0); -} - -/* - * compare the lines in a with the lines in b. 
a is assumed to have - * a control char at the start of each line, this char is ignored in the - * compare - */ -static PyObject * -testhunk(PyObject *self, PyObject *args) -{ - - PyObject *a, *b; - long bstart; - Py_ssize_t alen, blen; - Py_ssize_t i; - char *sa, *sb; - - if (!PyArg_ParseTuple(args, "OOl", &a, &b, &bstart)) - return NULL; - alen = PyList_Size(a); - blen = PyList_Size(b); - if (alen > blen - bstart || bstart < 0) { - return Py_BuildValue("l", -1); - } - for (i = 0; i < alen; i++) { - sa = PyBytes_AsString(PyList_GET_ITEM(a, i)); - sb = PyBytes_AsString(PyList_GET_ITEM(b, i + bstart)); - if (strcmp(sa + 1, sb) != 0) - return Py_BuildValue("l", -1); - } - return Py_BuildValue("l", 0); -} - -static PyMethodDef methods[] = { - {"addlines", addlines, METH_VARARGS, "add lines to a hunk\n"}, - {"fix_newline", fix_newline, METH_VARARGS, "fixup newline counters\n"}, - {"testhunk", testhunk, METH_VARARGS, "test lines in a hunk\n"}, - {NULL, NULL} -}; - -static const int version = 1; - -#ifdef IS_PY3K -static struct PyModuleDef diffhelpers_module = { - PyModuleDef_HEAD_INIT, - "diffhelpers", - diffhelpers_doc, - -1, - methods -}; - -PyMODINIT_FUNC PyInit_diffhelpers(void) -{ - PyObject *m; - - m = PyModule_Create(&diffhelpers_module); - if (m == NULL) - return NULL; - - diffhelpers_Error = PyErr_NewException("diffhelpers.diffhelpersError", - NULL, NULL); - Py_INCREF(diffhelpers_Error); - PyModule_AddObject(m, "diffhelpersError", diffhelpers_Error); - PyModule_AddIntConstant(m, "version", version); - - return m; -} -#else -PyMODINIT_FUNC -initdiffhelpers(void) -{ - PyObject *m; - m = Py_InitModule3("diffhelpers", methods, diffhelpers_doc); - diffhelpers_Error = PyErr_NewException("diffhelpers.diffhelpersError", - NULL, NULL); - PyModule_AddIntConstant(m, "version", version); -} -#endif diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/manifest.c --- a/mercurial/cext/manifest.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/manifest.c Wed Apr 18 15:32:08 
2018 -0400 @@ -718,7 +718,8 @@ Py_INCREF(self->pydata); for (i = 0; i < self->numlines; i++) { PyObject *arglist = NULL, *result = NULL; - arglist = Py_BuildValue("(s)", self->lines[i].start); + arglist = Py_BuildValue(PY23("(s)", "(y)"), + self->lines[i].start); if (!arglist) { return NULL; } diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/mpatch.c --- a/mercurial/cext/mpatch.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/mpatch.c Wed Apr 18 15:32:08 2018 -0400 @@ -55,10 +55,10 @@ ssize_t blen; int r; - PyObject *tmp = PyList_GetItem((PyObject*)bins, pos); + PyObject *tmp = PyList_GetItem((PyObject *)bins, pos); if (!tmp) return NULL; - if (PyObject_AsCharBuffer(tmp, &buffer, (Py_ssize_t*)&blen)) + if (PyObject_AsCharBuffer(tmp, &buffer, (Py_ssize_t *)&blen)) return NULL; if ((r = mpatch_decode(buffer, blen, &res)) < 0) { if (!PyErr_Occurred()) @@ -68,8 +68,7 @@ return res; } -static PyObject * -patches(PyObject *self, PyObject *args) +static PyObject *patches(PyObject *self, PyObject *args) { PyObject *text, *bins, *result; struct mpatch_flist *patch; @@ -110,7 +109,14 @@ goto cleanup; } out = PyBytes_AsString(result); - if ((r = mpatch_apply(out, in, inlen, patch)) < 0) { + /* clang-format off */ + { + Py_BEGIN_ALLOW_THREADS + r = mpatch_apply(out, in, inlen, patch); + Py_END_ALLOW_THREADS + } + /* clang-format on */ + if (r < 0) { Py_DECREF(result); result = NULL; } @@ -122,14 +128,13 @@ } /* calculate size of a patched file directly */ -static PyObject * -patchedsize(PyObject *self, PyObject *args) +static PyObject *patchedsize(PyObject *self, PyObject *args) { long orig, start, end, len, outlen = 0, last = 0, pos = 0; Py_ssize_t patchlen; char *bin; - if (!PyArg_ParseTuple(args, "ls#", &orig, &bin, &patchlen)) + if (!PyArg_ParseTuple(args, PY23("ls#", "ly#"), &orig, &bin, &patchlen)) return NULL; while (pos >= 0 && pos < patchlen) { @@ -146,7 +151,8 @@ if (pos != patchlen) { if (!PyErr_Occurred()) - PyErr_SetString(mpatch_Error, "patch cannot be 
decoded"); + PyErr_SetString(mpatch_Error, + "patch cannot be decoded"); return NULL; } @@ -155,20 +161,16 @@ } static PyMethodDef methods[] = { - {"patches", patches, METH_VARARGS, "apply a series of patches\n"}, - {"patchedsize", patchedsize, METH_VARARGS, "calculed patched size\n"}, - {NULL, NULL} + {"patches", patches, METH_VARARGS, "apply a series of patches\n"}, + {"patchedsize", patchedsize, METH_VARARGS, "calculed patched size\n"}, + {NULL, NULL}, }; static const int version = 1; #ifdef IS_PY3K static struct PyModuleDef mpatch_module = { - PyModuleDef_HEAD_INIT, - "mpatch", - mpatch_doc, - -1, - methods + PyModuleDef_HEAD_INIT, "mpatch", mpatch_doc, -1, methods, }; PyMODINIT_FUNC PyInit_mpatch(void) @@ -179,8 +181,8 @@ if (m == NULL) return NULL; - mpatch_Error = PyErr_NewException("mercurial.cext.mpatch.mpatchError", - NULL, NULL); + mpatch_Error = + PyErr_NewException("mercurial.cext.mpatch.mpatchError", NULL, NULL); Py_INCREF(mpatch_Error); PyModule_AddObject(m, "mpatchError", mpatch_Error); PyModule_AddIntConstant(m, "version", version); @@ -188,13 +190,12 @@ return m; } #else -PyMODINIT_FUNC -initmpatch(void) +PyMODINIT_FUNC initmpatch(void) { PyObject *m; m = Py_InitModule3("mpatch", methods, mpatch_doc); - mpatch_Error = PyErr_NewException("mercurial.cext.mpatch.mpatchError", - NULL, NULL); + mpatch_Error = + PyErr_NewException("mercurial.cext.mpatch.mpatchError", NULL, NULL); PyModule_AddIntConstant(m, "version", version); } #endif diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/osutil.c --- a/mercurial/cext/osutil.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/osutil.c Wed Apr 18 15:32:08 2018 -0400 @@ -121,6 +121,27 @@ o->ob_type->tp_free(o); } +static PyObject *listdir_stat_getitem(PyObject *self, PyObject *key) +{ + long index = PyLong_AsLong(key); + if (index == -1 && PyErr_Occurred()) { + return NULL; + } + if (index != 8) { + PyErr_Format(PyExc_IndexError, "osutil.stat objects only " + "support stat.ST_MTIME in " + "__getitem__"); 
+ return NULL; + } + return listdir_stat_st_mtime(self, NULL); +} + +static PyMappingMethods listdir_stat_type_mapping_methods = { + (lenfunc)NULL, /* mp_length */ + (binaryfunc)listdir_stat_getitem, /* mp_subscript */ + (objobjargproc)NULL, /* mp_ass_subscript */ +}; + static PyTypeObject listdir_stat_type = { PyVarObject_HEAD_INIT(NULL, 0) /* header */ "osutil.stat", /*tp_name*/ @@ -134,7 +155,7 @@ 0, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ + &listdir_stat_type_mapping_methods, /*tp_as_mapping*/ 0, /*tp_hash */ 0, /*tp_call*/ 0, /*tp_str*/ @@ -184,7 +205,7 @@ ? _S_IFDIR : _S_IFREG; if (!wantstat) - return Py_BuildValue("si", fd->cFileName, kind); + return Py_BuildValue(PY23("si", "yi"), fd->cFileName, kind); py_st = PyObject_CallObject((PyObject *)&listdir_stat_type, NULL); if (!py_st) @@ -202,7 +223,7 @@ if (kind == _S_IFREG) stp->st_size = ((__int64)fd->nFileSizeHigh << 32) + fd->nFileSizeLow; - return Py_BuildValue("siN", fd->cFileName, + return Py_BuildValue(PY23("siN", "yiN"), fd->cFileName, kind, py_st); } @@ -390,9 +411,11 @@ stat = makestat(&st); if (!stat) goto error; - elem = Py_BuildValue("siN", ent->d_name, kind, stat); + elem = Py_BuildValue(PY23("siN", "yiN"), ent->d_name, + kind, stat); } else - elem = Py_BuildValue("si", ent->d_name, kind); + elem = Py_BuildValue(PY23("si", "yi"), ent->d_name, + kind); if (!elem) goto error; stat = NULL; @@ -570,9 +593,11 @@ stat = makestat(&st); if (!stat) goto error; - elem = Py_BuildValue("siN", filename, kind, stat); + elem = Py_BuildValue(PY23("siN", "yiN"), + filename, kind, stat); } else - elem = Py_BuildValue("si", filename, kind); + elem = Py_BuildValue(PY23("si", "yi"), + filename, kind); if (!elem) goto error; stat = NULL; @@ -754,7 +779,7 @@ static PyObject *setprocname(PyObject *self, PyObject *args) { const char *name = NULL; - if (!PyArg_ParseTuple(args, "s", &name)) + if (!PyArg_ParseTuple(args, PY23("s", "y"), &name)) return NULL; #if 
defined(SETPROCNAME_USE_SETPROCTITLE) @@ -1101,14 +1126,14 @@ const char *path = NULL; struct statfs buf; int r; - if (!PyArg_ParseTuple(args, "s", &path)) + if (!PyArg_ParseTuple(args, PY23("s", "y"), &path)) return NULL; memset(&buf, 0, sizeof(buf)); r = statfs(path, &buf); if (r != 0) return PyErr_SetFromErrno(PyExc_OSError); - return Py_BuildValue("s", describefstype(&buf)); + return Py_BuildValue(PY23("s", "y"), describefstype(&buf)); } #endif /* defined(HAVE_LINUX_STATFS) || defined(HAVE_BSD_STATFS) */ @@ -1119,14 +1144,14 @@ const char *path = NULL; struct statfs buf; int r; - if (!PyArg_ParseTuple(args, "s", &path)) + if (!PyArg_ParseTuple(args, PY23("s", "y"), &path)) return NULL; memset(&buf, 0, sizeof(buf)); r = statfs(path, &buf); if (r != 0) return PyErr_SetFromErrno(PyExc_OSError); - return Py_BuildValue("s", buf.f_mntonname); + return Py_BuildValue(PY23("s", "y"), buf.f_mntonname); } #endif /* defined(HAVE_BSD_STATFS) */ @@ -1160,7 +1185,8 @@ static char *kwlist[] = {"path", "stat", "skip", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|OO:listdir", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, PY23("s#|OO:listdir", + "y#|OO:listdir"), kwlist, &path, &plen, &statobj, &skipobj)) return NULL; @@ -1193,7 +1219,9 @@ int plus; FILE *fp; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "et|si:posixfile", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwds, PY23("et|si:posixfile", + "et|yi:posixfile"), + kwlist, Py_FileSystemDefaultEncoding, &name, &mode, &bufsize)) return NULL; @@ -1345,7 +1373,7 @@ {NULL, NULL} }; -static const int version = 3; +static const int version = 4; #ifdef IS_PY3K static struct PyModuleDef osutil_module = { diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/parsers.c --- a/mercurial/cext/parsers.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/parsers.c Wed Apr 18 15:32:08 2018 -0400 @@ -48,8 +48,9 @@ char *str, *start, *end; int len; - if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest", &PyDict_Type, - 
&mfdict, &PyDict_Type, &fdict, &str, &len)) + if (!PyArg_ParseTuple( + args, PY23("O!O!s#:parse_manifest", "O!O!y#:parse_manifest"), + &PyDict_Type, &mfdict, &PyDict_Type, &fdict, &str, &len)) goto quit; start = str; @@ -241,8 +242,9 @@ unsigned int flen, len, pos = 40; int readlen; - if (!PyArg_ParseTuple(args, "O!O!s#:parse_dirstate", &PyDict_Type, - &dmap, &PyDict_Type, &cmap, &str, &readlen)) + if (!PyArg_ParseTuple( + args, PY23("O!O!s#:parse_dirstate", "O!O!y#:parse_dirstate"), + &PyDict_Type, &dmap, &PyDict_Type, &cmap, &str, &readlen)) goto quit; len = readlen; @@ -254,7 +256,7 @@ goto quit; } - parents = Py_BuildValue("s#s#", str, 20, str + 20, 20); + parents = Py_BuildValue(PY23("s#s#", "y#y#"), str, 20, str + 20, 20); if (!parents) goto quit; @@ -645,7 +647,8 @@ Py_ssize_t offset, stop; PyObject *markers = NULL; - if (!PyArg_ParseTuple(args, "s#nn", &data, &datalen, &offset, &stop)) { + if (!PyArg_ParseTuple(args, PY23("s#nn", "y#nn"), &data, &datalen, + &offset, &stop)) { return NULL; } dataend = data + datalen; diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/pathencode.c --- a/mercurial/cext/pathencode.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/pathencode.c Wed Apr 18 15:32:08 2018 -0400 @@ -26,26 +26,26 @@ /* state machine for the fast path */ enum path_state { - START, /* first byte of a path component */ - A, /* "AUX" */ + START, /* first byte of a path component */ + A, /* "AUX" */ AU, - THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ - C, /* "CON" or "COMn" */ + THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ + C, /* "CON" or "COMn" */ CO, - COMLPT, /* "COM" or "LPT" */ + COMLPT, /* "COM" or "LPT" */ COMLPTn, L, LP, N, NU, - P, /* "PRN" */ + P, /* "PRN" */ PR, - LDOT, /* leading '.' */ - DOT, /* '.' in a non-leading position */ - H, /* ".h" */ - HGDI, /* ".hg", ".d", or ".i" */ + LDOT, /* leading '.' */ + DOT, /* '.' 
in a non-leading position */ + H, /* ".h" */ + HGDI, /* ".hg", ".d", or ".i" */ SPACE, - DEFAULT /* byte of a path component after the first */ + DEFAULT, /* byte of a path component after the first */ }; /* state machine for dir-encoding */ @@ -53,7 +53,7 @@ DDOT, DH, DHGDI, - DDEFAULT + DDEFAULT, }; static inline int inset(const uint32_t bitset[], char c) @@ -82,7 +82,7 @@ } static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize, - uint8_t c) + uint8_t c) { static const char hexdigit[] = "0123456789abcdef"; @@ -92,14 +92,14 @@ /* 3-byte escape: tilde followed by two hex digits */ static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize, - char c) + char c) { charcopy(dest, destlen, destsize, '~'); hexencode(dest, destlen, destsize, c); } -static Py_ssize_t _encodedir(char *dest, size_t destsize, - const char *src, Py_ssize_t len) +static Py_ssize_t _encodedir(char *dest, size_t destsize, const char *src, + Py_ssize_t len) { enum dir_state state = DDEFAULT; Py_ssize_t i = 0, destlen = 0; @@ -126,8 +126,8 @@ if (src[i] == 'g') { state = DHGDI; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DDEFAULT; + } else + state = DDEFAULT; break; case DHGDI: if (src[i] == '/') { @@ -173,17 +173,15 @@ if (newobj) { assert(PyBytes_Check(newobj)); Py_SIZE(newobj)--; - _encodedir(PyBytes_AS_STRING(newobj), newlen, path, - len + 1); + _encodedir(PyBytes_AS_STRING(newobj), newlen, path, len + 1); } return newobj; } static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8], - char *dest, Py_ssize_t destlen, size_t destsize, - const char *src, Py_ssize_t len, - int encodedir) + char *dest, Py_ssize_t destlen, size_t destsize, + const char *src, Py_ssize_t len, int encodedir) { enum path_state state = START; Py_ssize_t i = 0; @@ -237,15 +235,15 @@ if (src[i] == 'u') { state = AU; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case AU: if 
(src[i] == 'x') { state = THIRD; i++; - } - else state = DEFAULT; + } else + state = DEFAULT; break; case THIRD: state = DEFAULT; @@ -264,24 +262,30 @@ if (src[i] == 'o') { state = CO; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case CO: if (src[i] == 'm') { state = COMLPT; i++; - } - else if (src[i] == 'n') { + } else if (src[i] == 'n') { state = THIRD; i++; - } - else state = DEFAULT; + } else + state = DEFAULT; break; case COMLPT: switch (src[i]) { - case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': state = COMLPTn; i++; break; @@ -301,8 +305,8 @@ charcopy(dest, &destlen, destsize, src[i - 1]); break; default: - memcopy(dest, &destlen, destsize, - &src[i - 2], 2); + memcopy(dest, &destlen, destsize, &src[i - 2], + 2); break; } break; @@ -310,43 +314,43 @@ if (src[i] == 'p') { state = LP; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case LP: if (src[i] == 't') { state = COMLPT; i++; - } - else state = DEFAULT; + } else + state = DEFAULT; break; case N: if (src[i] == 'u') { state = NU; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case NU: if (src[i] == 'l') { state = THIRD; i++; - } - else state = DEFAULT; + } else + state = DEFAULT; break; case P: if (src[i] == 'r') { state = PR; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case PR: if (src[i] == 'n') { state = THIRD; i++; - } - else state = DEFAULT; + } else + state = DEFAULT; break; case LDOT: switch (src[i]) { @@ -393,18 +397,18 @@ if (src[i] == 'g') { state = HGDI; charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case HGDI: if (src[i] == '/') { state 
= START; if (encodedir) memcopy(dest, &destlen, destsize, ".hg", - 3); + 3); charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; + } else + state = DEFAULT; break; case SPACE: switch (src[i]) { @@ -444,19 +448,17 @@ if (inset(onebyte, src[i])) { do { charcopy(dest, &destlen, - destsize, src[i++]); + destsize, src[i++]); } while (i < len && - inset(onebyte, src[i])); - } - else if (inset(twobytes, src[i])) { + inset(onebyte, src[i])); + } else if (inset(twobytes, src[i])) { char c = src[i++]; charcopy(dest, &destlen, destsize, '_'); charcopy(dest, &destlen, destsize, - c == '_' ? '_' : c + 32); - } - else + c == '_' ? '_' : c + 32); + } else escape3(dest, &destlen, destsize, - src[i++]); + src[i++]); break; } break; @@ -466,31 +468,29 @@ return destlen; } -static Py_ssize_t basicencode(char *dest, size_t destsize, - const char *src, Py_ssize_t len) +static Py_ssize_t basicencode(char *dest, size_t destsize, const char *src, + Py_ssize_t len) { - static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe }; + static const uint32_t twobytes[8] = {0, 0, 0x87fffffe}; static const uint32_t onebyte[8] = { - 1, 0x2bff3bfa, 0x68000001, 0x2fffffff, + 1, 0x2bff3bfa, 0x68000001, 0x2fffffff, }; Py_ssize_t destlen = 0; - return _encode(twobytes, onebyte, dest, destlen, destsize, - src, len, 1); + return _encode(twobytes, onebyte, dest, destlen, destsize, src, len, 1); } static const Py_ssize_t maxstorepathlen = 120; -static Py_ssize_t _lowerencode(char *dest, size_t destsize, - const char *src, Py_ssize_t len) +static Py_ssize_t _lowerencode(char *dest, size_t destsize, const char *src, + Py_ssize_t len) { - static const uint32_t onebyte[8] = { - 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff - }; + static const uint32_t onebyte[8] = {1, 0x2bfffbfb, 0xe8000001, + 0x2fffffff}; - static const uint32_t lower[8] = { 0, 0, 0x7fffffe }; + static const uint32_t lower[8] = {0, 0, 0x7fffffe}; Py_ssize_t i, destlen = 0; @@ -512,7 +512,8 @@ Py_ssize_t len, newlen; PyObject *ret; - 
if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len)) + if (!PyArg_ParseTuple(args, PY23("s#:lowerencode", "y#:lowerencode"), + &path, &len)) return NULL; newlen = _lowerencode(NULL, 0, path, len); @@ -524,13 +525,13 @@ } /* See store.py:_auxencode for a description. */ -static Py_ssize_t auxencode(char *dest, size_t destsize, - const char *src, Py_ssize_t len) +static Py_ssize_t auxencode(char *dest, size_t destsize, const char *src, + Py_ssize_t len) { static const uint32_t twobytes[8]; static const uint32_t onebyte[8] = { - ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U, + ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U, }; return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0); @@ -590,8 +591,7 @@ break; charcopy(dest, &destlen, destsize, src[i]); p = -1; - } - else if (p < dirprefixlen) + } else if (p < dirprefixlen) charcopy(dest, &destlen, destsize, src[i]); } @@ -622,13 +622,13 @@ slop = maxstorepathlen - used; if (slop > 0) { Py_ssize_t basenamelen = - lastslash >= 0 ? len - lastslash - 2 : len - 1; + lastslash >= 0 ? len - lastslash - 2 : len - 1; if (basenamelen > slop) basenamelen = slop; if (basenamelen > 0) memcopy(dest, &destlen, destsize, &src[lastslash + 1], - basenamelen); + basenamelen); } /* Add hash and suffix. 
*/ @@ -637,7 +637,7 @@ if (lastdot >= 0) memcopy(dest, &destlen, destsize, &src[lastdot], - len - lastdot - 1); + len - lastdot - 1); assert(PyBytes_Check(ret)); Py_SIZE(ret) = destlen; @@ -672,8 +672,8 @@ if (shafunc == NULL) { PyErr_SetString(PyExc_AttributeError, - "module 'hashlib' has no " - "attribute 'sha1'"); + "module 'hashlib' has no " + "attribute 'sha1'"); return -1; } } @@ -690,7 +690,7 @@ if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) { PyErr_SetString(PyExc_TypeError, - "result of digest is not a 20-byte hash"); + "result of digest is not a 20-byte hash"); Py_DECREF(hashobj); return -1; } @@ -755,10 +755,9 @@ assert(PyBytes_Check(newobj)); Py_SIZE(newobj)--; basicencode(PyBytes_AS_STRING(newobj), newlen, path, - len + 1); + len + 1); } - } - else + } else newobj = hashencode(path, len + 1); return newobj; diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/revlog.c --- a/mercurial/cext/revlog.c Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/revlog.c Wed Apr 18 15:32:08 2018 -0400 @@ -87,9 +87,9 @@ static Py_ssize_t inline_scan(indexObject *self, const char **offsets); #if LONG_MAX == 0x7fffffffL -static char *tuple_format = "Kiiiiiis#"; +static const char *const tuple_format = PY23("Kiiiiiis#", "Kiiiiiiy#"); #else -static char *tuple_format = "kiiiiiis#"; +static const char *const tuple_format = PY23("kiiiiiis#", "kiiiiiiy#"); #endif /* A RevlogNG v1 index entry is 64 bytes long. 
*/ @@ -643,8 +643,10 @@ if (!PyArg_ParseTuple(args, "O", &roots)) goto done; - if (roots == NULL || !PyList_Check(roots)) + if (roots == NULL || !PyList_Check(roots)) { + PyErr_SetString(PyExc_TypeError, "roots must be a list"); goto done; + } phases = calloc(len, 1); /* phase per rev: {0: public, 1: draft, 2: secret} */ if (phases == NULL) { @@ -667,8 +669,11 @@ if (phaseset == NULL) goto release; PyList_SET_ITEM(phasessetlist, i+1, phaseset); - if (!PyList_Check(phaseroots)) + if (!PyList_Check(phaseroots)) { + PyErr_SetString(PyExc_TypeError, + "roots item must be a list"); goto release; + } minrevphase = add_roots_get_min(self, phaseroots, i+1, phases); if (minrevphase == -2) /* Error from add_roots_get_min */ goto release; @@ -1243,7 +1248,7 @@ char *node; int rev, i; - if (!PyArg_ParseTuple(args, "s#", &node, &nodelen)) + if (!PyArg_ParseTuple(args, PY23("s#", "y#"), &node, &nodelen)) return NULL; if (nodelen < 4) { @@ -2077,7 +2082,7 @@ Py_INCREF(&indexType); PyModule_AddObject(mod, "index", (PyObject *)&indexType); - nullentry = Py_BuildValue("iiiiiiis#", 0, 0, 0, + nullentry = Py_BuildValue(PY23("iiiiiiis#", "iiiiiiiy#"), 0, 0, 0, -1, -1, -1, -1, nullid, 20); if (nullentry) PyObject_GC_UnTrack(nullentry); diff -r fb92df8b634c -r ed5448edcbfa mercurial/cext/util.h --- a/mercurial/cext/util.h Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cext/util.h Wed Apr 18 15:32:08 2018 -0400 @@ -14,6 +14,13 @@ #define IS_PY3K #endif +/* helper to switch things like string literal depending on Python version */ +#ifdef IS_PY3K +#define PY23(py2, py3) py3 +#else +#define PY23(py2, py3) py2 +#endif + /* clang-format off */ typedef struct { PyObject_HEAD diff -r fb92df8b634c -r ed5448edcbfa mercurial/cffi/bdiffbuild.py --- a/mercurial/cffi/bdiffbuild.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cffi/bdiffbuild.py Wed Apr 18 15:32:08 2018 -0400 @@ -4,9 +4,10 @@ import os ffi = cffi.FFI() -ffi.set_source("mercurial.cffi._bdiff", - 
open(os.path.join(os.path.join(os.path.dirname(__file__), '..'), - 'bdiff.c')).read(), include_dirs=['mercurial']) +with open(os.path.join(os.path.join(os.path.dirname(__file__), '..'), + 'bdiff.c')) as f: + ffi.set_source("mercurial.cffi._bdiff", + f.read(), include_dirs=['mercurial']) ffi.cdef(""" struct bdiff_line { int hash, n, e; diff -r fb92df8b634c -r ed5448edcbfa mercurial/cffi/mpatchbuild.py --- a/mercurial/cffi/mpatchbuild.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cffi/mpatchbuild.py Wed Apr 18 15:32:08 2018 -0400 @@ -6,8 +6,9 @@ ffi = cffi.FFI() mpatch_c = os.path.join(os.path.join(os.path.dirname(__file__), '..', 'mpatch.c')) -ffi.set_source("mercurial.cffi._mpatch", open(mpatch_c).read(), - include_dirs=["mercurial"]) +with open(mpatch_c) as f: + ffi.set_source("mercurial.cffi._mpatch", f.read(), + include_dirs=["mercurial"]) ffi.cdef(""" struct mpatch_frag { diff -r fb92df8b634c -r ed5448edcbfa mercurial/changegroup.py --- a/mercurial/changegroup.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/changegroup.py Wed Apr 18 15:32:08 2018 -0400 @@ -28,10 +28,20 @@ util, ) +from .utils import ( + stringutil, +) + _CHANGEGROUPV1_DELTA_HEADER = "20s20s20s20s" _CHANGEGROUPV2_DELTA_HEADER = "20s20s20s20s20s" _CHANGEGROUPV3_DELTA_HEADER = ">20s20s20s20s20sH" +LFS_REQUIREMENT = 'lfs' + +# When narrowing is finalized and no longer subject to format changes, +# we should move this to just "narrow" or similar. 
+NARROW_REQUIREMENT = 'narrowhg-experimental' + readexactly = util.readexactly def getchunk(stream): @@ -71,7 +81,7 @@ fh = open(filename, "wb", 131072) else: fd, filename = tempfile.mkstemp(prefix="hg-bundle-", suffix=".hg") - fh = os.fdopen(fd, pycompat.sysstr("wb")) + fh = os.fdopen(fd, r"wb") cleanup = filename for c in chunks: fh.write(c) @@ -407,7 +417,7 @@ newheads = [h for h in repo.heads() if h not in oldheads] repo.ui.log("incoming", - "%s incoming changes - new heads: %s\n", + "%d incoming changes - new heads: %s\n", len(added), ', '.join([hex(c[:6]) for c in newheads])) @@ -510,7 +520,7 @@ if reorder == 'auto': reorder = None else: - reorder = util.parsebool(reorder) + reorder = stringutil.parsebool(reorder) self._repo = repo self._reorder = reorder self._progress = repo.ui.progress @@ -748,7 +758,8 @@ for i, fname in enumerate(sorted(changedfiles)): filerevlog = repo.file(fname) if not filerevlog: - raise error.Abort(_("empty or missing revlog for %s") % fname) + raise error.Abort(_("empty or missing file data for %s") % + fname) linkrevnodes = linknodes(filerevlog, fname) # Lookup for filenodes, we collected the linkrev nodes above in the @@ -899,6 +910,17 @@ # support versions 01 and 02. versions.discard('01') versions.discard('02') + if NARROW_REQUIREMENT in repo.requirements: + # Versions 01 and 02 don't support revlog flags, and we need to + # support that for stripping and unbundling to work. + versions.discard('01') + versions.discard('02') + if LFS_REQUIREMENT in repo.requirements: + # Versions 01 and 02 don't support revlog flags, and we need to + # mark LFS entries with REVIDX_EXTSTORED. + versions.discard('01') + versions.discard('02') + return versions def localversion(repo): diff -r fb92df8b634c -r ed5448edcbfa mercurial/changelog.py --- a/mercurial/changelog.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/changelog.py Wed Apr 18 15:32:08 2018 -0400 @@ -20,9 +20,14 @@ from . 
import ( encoding, error, + pycompat, revlog, util, ) +from .utils import ( + dateutil, + stringutil, +) _defaultextra = {'branch': 'default'} @@ -34,7 +39,7 @@ >>> s 'ab\\ncd\\\\\\\\n\\x00ab\\rcd\\\\\\n' >>> res = _string_escape(s) - >>> s == util.unescapestr(res) + >>> s == stringutil.unescapestr(res) True """ # subset of the string_escape codec @@ -60,7 +65,7 @@ l = l.replace('\\\\', '\\\\\n') l = l.replace('\\0', '\0') l = l.replace('\n', '') - k, v = util.unescapestr(l).split(':', 1) + k, v = stringutil.unescapestr(l).split(':', 1) extra[k] = v return extra @@ -90,6 +95,11 @@ return self.offset def flush(self): pass + + @property + def closed(self): + return self.fp.closed + def close(self): self.fp.close() @@ -127,6 +137,13 @@ self.offset += len(s) self._end += len(s) + def __enter__(self): + self.fp.__enter__() + return self + + def __exit__(self, *args): + return self.fp.__exit__(*args) + def _divertopener(opener, target): """build an opener that writes in 'target.a' instead of 'target'""" def _divert(name, mode='r', checkambig=False): @@ -420,7 +437,7 @@ self._delaybuf = None self._divert = False # split when we're done - self.checkinlinesize(tr) + self._enforceinlinesize(tr) def _writepending(self, tr): "create a file containing the unfinalized state for pretxnchangegroup" @@ -446,9 +463,9 @@ return False - def checkinlinesize(self, tr, fp=None): + def _enforceinlinesize(self, tr, fp=None): if not self._delayed: - revlog.revlog.checkinlinesize(self, tr, fp) + revlog.revlog._enforceinlinesize(self, tr, fp) def read(self, node): """Obtain data from a parsed changelog revision. 
@@ -505,15 +522,15 @@ if not user: raise error.RevlogError(_("empty username")) if "\n" in user: - raise error.RevlogError(_("username %s contains a newline") - % repr(user)) + raise error.RevlogError(_("username %r contains a newline") + % pycompat.bytestr(user)) desc = stripdesc(desc) if date: - parseddate = "%d %d" % util.parsedate(date) + parseddate = "%d %d" % dateutil.parsedate(date) else: - parseddate = "%d %d" % util.makedate() + parseddate = "%d %d" % dateutil.makedate() if extra: branch = extra.get("branch") if branch in ("default", ""): diff -r fb92df8b634c -r ed5448edcbfa mercurial/chgserver.py --- a/mercurial/chgserver.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/chgserver.py Wed Apr 18 15:32:08 2018 -0400 @@ -45,6 +45,7 @@ import os import re import socket +import stat import struct import time @@ -60,6 +61,10 @@ util, ) +from .utils import ( + procutil, +) + _log = commandserver.log def _hashlist(items): @@ -161,7 +166,7 @@ def trystat(path): try: st = os.stat(path) - return (st.st_mtime, st.st_size) + return (st[stat.ST_MTIME], st.st_size) except OSError: # could be ENOENT, EPERM etc. not fatal in any case pass @@ -199,13 +204,13 @@ # these situations and will behave differently (write to stdout). 
if (out is not self.fout or not util.safehasattr(self.fout, 'fileno') - or self.fout.fileno() != util.stdout.fileno()): - return util.system(cmd, environ=environ, cwd=cwd, out=out) + or self.fout.fileno() != procutil.stdout.fileno()): + return procutil.system(cmd, environ=environ, cwd=cwd, out=out) self.flush() - return self._csystem(cmd, util.shellenviron(environ), cwd) + return self._csystem(cmd, procutil.shellenviron(environ), cwd) def _runpager(self, cmd, env=None): - self._csystem(cmd, util.shellenviron(env), type='pager', + self._csystem(cmd, procutil.shellenviron(env), type='pager', cmdtable={'attachio': attachio}) return True @@ -266,7 +271,7 @@ self.channel = channel def __call__(self, cmd, environ, cwd=None, type='system', cmdtable=None): - args = [type, util.quotecommand(cmd), os.path.abspath(cwd or '.')] + args = [type, procutil.quotecommand(cmd), os.path.abspath(cwd or '.')] args.extend('%s=%s' % (k, v) for k, v in environ.iteritems()) data = '\0'.join(args) self.out.write(struct.pack('>cI', self.channel, len(data))) @@ -295,9 +300,9 @@ _iochannels = [ # server.ch, ui.fp, mode - ('cin', 'fin', pycompat.sysstr('rb')), - ('cout', 'fout', pycompat.sysstr('wb')), - ('cerr', 'ferr', pycompat.sysstr('wb')), + ('cin', 'fin', r'rb'), + ('cout', 'fout', r'wb'), + ('cerr', 'ferr', r'wb'), ] class chgcmdserver(commandserver.server): @@ -472,12 +477,12 @@ 'setenv': setenv, 'setumask': setumask}) - if util.safehasattr(util, 'setprocname'): + if util.safehasattr(procutil, 'setprocname'): def setprocname(self): """Change process title""" name = self._readstr() _log('setprocname: %r\n' % name) - util.setprocname(name) + procutil.setprocname(name) capabilities['setprocname'] = setprocname def _tempaddress(address): @@ -546,9 +551,9 @@ def _issocketowner(self): try: - stat = os.stat(self._realaddress) - return (stat.st_ino == self._socketstat.st_ino and - stat.st_mtime == self._socketstat.st_mtime) + st = os.stat(self._realaddress) + return (st.st_ino == 
self._socketstat.st_ino and + st[stat.ST_MTIME] == self._socketstat[stat.ST_MTIME]) except OSError: return False diff -r fb92df8b634c -r ed5448edcbfa mercurial/cmdutil.py --- a/mercurial/cmdutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/cmdutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -8,7 +8,6 @@ from __future__ import absolute_import import errno -import itertools import os import re import tempfile @@ -26,14 +25,13 @@ changelog, copies, crecord as crecordmod, - dagop, dirstateguard, encoding, error, formatter, - graphmod, + logcmdutil, match as matchmod, - mdiff, + merge as mergemod, mergeutil, obsolete, patch, @@ -41,16 +39,21 @@ pycompat, registrar, revlog, - revset, - revsetlang, rewriteutil, scmutil, smartset, + subrepoutil, templatekw, templater, util, vfs as vfsmod, ) + +from .utils import ( + dateutil, + stringutil, +) + stringio = util.stringio # templates of common command options @@ -60,6 +63,11 @@ _('do not perform actions, just print output')), ] +confirmopts = [ + ('', 'confirm', None, + _('ask before applying actions')), +] + remoteopts = [ ('e', 'ssh', '', _('specify ssh command to use'), _('CMD')), @@ -226,7 +234,6 @@ def dorecord(ui, repo, commitfunc, cmdsuggest, backupall, filterfn, *pats, **opts): - from . import merge as mergemod opts = pycompat.byteskwargs(opts) if not ui.interactive(): if cmdsuggest: @@ -366,7 +373,7 @@ ui.debug(fp.getvalue()) patch.internalpatch(ui, repo, fp, 1, eolmode=None) except error.PatchError as err: - raise error.Abort(str(err)) + raise error.Abort(pycompat.bytestr(err)) del fp # 4. We prepared working directory according to filtered @@ -563,8 +570,6 @@ return '\n'.join(commentedlines) + '\n' def _conflictsmsg(repo): - # avoid merge cycle - from . 
import merge as mergemod mergestate = mergemod.mergestate.read(repo) if not mergestate.active(): return @@ -899,65 +904,124 @@ else: return commiteditor -def loglimit(opts): - """get the log limit according to option -l/--limit""" - limit = opts.get('limit') - if limit: - try: - limit = int(limit) - except ValueError: - raise error.Abort(_('limit must be a positive integer')) - if limit <= 0: - raise error.Abort(_('limit must be positive')) - else: - limit = None - return limit - -def makefilename(repo, pat, node, desc=None, - total=None, seqno=None, revwidth=None, pathname=None): - node_expander = { - 'H': lambda: hex(node), - 'R': lambda: '%d' % repo.changelog.rev(node), - 'h': lambda: short(node), - 'm': lambda: re.sub('[^\w]', '_', desc or '') - } +def _escapecommandtemplate(tmpl): + parts = [] + for typ, start, end in templater.scantemplate(tmpl, raw=True): + if typ == b'string': + parts.append(stringutil.escapestr(tmpl[start:end])) + else: + parts.append(tmpl[start:end]) + return b''.join(parts) + +def rendercommandtemplate(ui, tmpl, props): + r"""Expand a literal template 'tmpl' in a way suitable for command line + + '\' in outermost string is not taken as an escape character because it + is a directory separator on Windows. + + >>> from . import ui as uimod + >>> ui = uimod.ui() + >>> rendercommandtemplate(ui, b'c:\\{path}', {b'path': b'foo'}) + 'c:\\foo' + >>> rendercommandtemplate(ui, b'{"c:\\{path}"}', {'path': b'foo'}) + 'c:{path}' + """ + if not tmpl: + return tmpl + t = formatter.maketemplater(ui, _escapecommandtemplate(tmpl)) + return t.renderdefault(props) + +def rendertemplate(ctx, tmpl, props=None): + """Expand a literal template 'tmpl' byte-string against one changeset + + Each props item must be a stringify-able value or a callable returning + such value, i.e. no bare list nor dict should be passed. 
+ """ + repo = ctx.repo() + tres = formatter.templateresources(repo.ui, repo) + t = formatter.maketemplater(repo.ui, tmpl, defaults=templatekw.keywords, + resources=tres) + mapping = {'ctx': ctx} + if props: + mapping.update(props) + return t.renderdefault(mapping) + +def _buildfntemplate(pat, total=None, seqno=None, revwidth=None, pathname=None): + r"""Convert old-style filename format string to template string + + >>> _buildfntemplate(b'foo-%b-%n.patch', seqno=0) + 'foo-{reporoot|basename}-{seqno}.patch' + >>> _buildfntemplate(b'%R{tags % "{tag}"}%H') + '{rev}{tags % "{tag}"}{node}' + + '\' in outermost strings has to be escaped because it is a directory + separator on Windows: + + >>> _buildfntemplate(b'c:\\tmp\\%R\\%n.patch', seqno=0) + 'c:\\\\tmp\\\\{rev}\\\\{seqno}.patch' + >>> _buildfntemplate(b'\\\\foo\\bar.patch') + '\\\\\\\\foo\\\\bar.patch' + >>> _buildfntemplate(b'\\{tags % "{tag}"}') + '\\\\{tags % "{tag}"}' + + but inner strings follow the template rules (i.e. '\' is taken as an + escape character): + + >>> _buildfntemplate(br'{"c:\tmp"}', seqno=0) + '{"c:\\tmp"}' + """ expander = { - '%': lambda: '%', - 'b': lambda: os.path.basename(repo.root), - } - - try: - if node: - expander.update(node_expander) - if node: - expander['r'] = (lambda: - ('%d' % repo.changelog.rev(node)).zfill(revwidth or 0)) - if total is not None: - expander['N'] = lambda: '%d' % total - if seqno is not None: - expander['n'] = lambda: '%d' % seqno - if total is not None and seqno is not None: - expander['n'] = (lambda: ('%d' % seqno).zfill(len('%d' % total))) - if pathname is not None: - expander['s'] = lambda: os.path.basename(pathname) - expander['d'] = lambda: os.path.dirname(pathname) or '.' 
- expander['p'] = lambda: pathname - - newname = [] - patlen = len(pat) - i = 0 - while i < patlen: - c = pat[i:i + 1] - if c == '%': - i += 1 - c = pat[i:i + 1] - c = expander[c]() - newname.append(c) - i += 1 - return ''.join(newname) - except KeyError as inst: - raise error.Abort(_("invalid format spec '%%%s' in output filename") % - inst.args[0]) + b'H': b'{node}', + b'R': b'{rev}', + b'h': b'{node|short}', + b'm': br'{sub(r"[^\w]", "_", desc|firstline)}', + b'r': b'{if(revwidth, pad(rev, revwidth, "0", left=True), rev)}', + b'%': b'%', + b'b': b'{reporoot|basename}', + } + if total is not None: + expander[b'N'] = b'{total}' + if seqno is not None: + expander[b'n'] = b'{seqno}' + if total is not None and seqno is not None: + expander[b'n'] = b'{pad(seqno, total|stringify|count, "0", left=True)}' + if pathname is not None: + expander[b's'] = b'{pathname|basename}' + expander[b'd'] = b'{if(pathname|dirname, pathname|dirname, ".")}' + expander[b'p'] = b'{pathname}' + + newname = [] + for typ, start, end in templater.scantemplate(pat, raw=True): + if typ != b'string': + newname.append(pat[start:end]) + continue + i = start + while i < end: + n = pat.find(b'%', i, end) + if n < 0: + newname.append(stringutil.escapestr(pat[i:end])) + break + newname.append(stringutil.escapestr(pat[i:n])) + if n + 2 > end: + raise error.Abort(_("incomplete format spec in output " + "filename")) + c = pat[n + 1:n + 2] + i = n + 2 + try: + newname.append(expander[c]) + except KeyError: + raise error.Abort(_("invalid format spec '%%%s' in output " + "filename") % c) + return ''.join(newname) + +def makefilename(ctx, pat, **props): + if not pat: + return pat + tmpl = _buildfntemplate(pat, **props) + # BUG: alias expansion shouldn't be made against template fragments + # rewritten from %-format strings, but we have no easy way to partially + # disable the expansion. 
+ return rendertemplate(ctx, tmpl, pycompat.byteskwargs(props)) def isstdiofilename(pat): """True if the given pat looks like a filename denoting stdin/stdout""" @@ -982,23 +1046,17 @@ def __exit__(self, exc_type, exc_value, exc_tb): pass -def makefileobj(repo, pat, node=None, desc=None, total=None, - seqno=None, revwidth=None, mode='wb', modemap=None, - pathname=None): - +def makefileobj(ctx, pat, mode='wb', **props): writable = mode not in ('r', 'rb') if isstdiofilename(pat): + repo = ctx.repo() if writable: fp = repo.ui.fout else: fp = repo.ui.fin return _unclosablefile(fp) - fn = makefilename(repo, pat, node, desc, total, seqno, revwidth, pathname) - if modemap is not None: - mode = modemap.get(fn, mode) - if mode == 'wb': - modemap[fn] = 'ab' + fn = makefilename(ctx, pat, **props) return open(fn, mode) def openrevlog(repo, cmd, file_, opts): @@ -1028,6 +1086,8 @@ if 'treemanifest' not in repo.requirements: raise error.Abort(_("--dir can only be used on repos with " "treemanifest enabled")) + if not dir.endswith('/'): + dir = dir + '/' dirlog = repo.manifestlog._revlog.dirlog(dir) if len(dirlog): r = dirlog @@ -1162,7 +1222,9 @@ os.rename(src, tmp) os.rename(tmp, target) else: - util.copyfile(src, target) + # Preserve stat info on renames, not on copies; this matches + # Linux CLI behavior. + util.copyfile(src, target, copystat=rename) srcexists = True except IOError as inst: if inst.errno == errno.ENOENT: @@ -1313,7 +1375,7 @@ # - ctx: the changectx created by import. extrapostimportmap = {} -def tryimportone(ui, repo, hunk, parents, opts, msgs, updatefunc): +def tryimportone(ui, repo, patchdata, parents, opts, msgs, updatefunc): """Utility function used by commands.import to import a single patch This function is explicitly defined here to help the evolve extension to @@ -1322,7 +1384,8 @@ The API is currently a bit ugly because it a simple code translation from the import command. Feel free to make it better. 
- :hunk: a patch (as a binary string) + :patchdata: a dictionary containing parsed patch data (such as from + ``patch.extract()``) :parents: nodes that will be parent of the created commit :opts: the full dict of option passed to the import command :msgs: list to save commit message to. @@ -1332,15 +1395,15 @@ """ # avoid cycle context -> subrepo -> cmdutil from . import context - extractdata = patch.extract(ui, hunk) - tmpname = extractdata.get('filename') - message = extractdata.get('message') - user = opts.get('user') or extractdata.get('user') - date = opts.get('date') or extractdata.get('date') - branch = extractdata.get('branch') - nodeid = extractdata.get('nodeid') - p1 = extractdata.get('p1') - p2 = extractdata.get('p2') + + tmpname = patchdata.get('filename') + message = patchdata.get('message') + user = opts.get('user') or patchdata.get('user') + date = opts.get('date') or patchdata.get('date') + branch = patchdata.get('branch') + nodeid = patchdata.get('nodeid') + p1 = patchdata.get('p1') + p2 = patchdata.get('p2') nocommit = opts.get('no_commit') importbranch = opts.get('import_branch') @@ -1348,141 +1411,139 @@ strip = opts["strip"] prefix = opts["prefix"] sim = float(opts.get('similarity') or 0) + if not tmpname: - return (None, None, False) + return None, None, False rejects = False - try: - cmdline_message = logmessage(ui, opts) - if cmdline_message: - # pickup the cmdline msg - message = cmdline_message - elif message: - # pickup the patch msg - message = message.strip() - else: - # launch the editor - message = None - ui.debug('message:\n%s\n' % message) - - if len(parents) == 1: - parents.append(repo[nullid]) - if opts.get('exact'): - if not nodeid or not p1: - raise error.Abort(_('not a Mercurial patch')) + cmdline_message = logmessage(ui, opts) + if cmdline_message: + # pickup the cmdline msg + message = cmdline_message + elif message: + # pickup the patch msg + message = message.strip() + else: + # launch the editor + message = None + 
ui.debug('message:\n%s\n' % (message or '')) + + if len(parents) == 1: + parents.append(repo[nullid]) + if opts.get('exact'): + if not nodeid or not p1: + raise error.Abort(_('not a Mercurial patch')) + p1 = repo[p1] + p2 = repo[p2 or nullid] + elif p2: + try: p1 = repo[p1] - p2 = repo[p2 or nullid] - elif p2: - try: - p1 = repo[p1] - p2 = repo[p2] - # Without any options, consider p2 only if the - # patch is being applied on top of the recorded - # first parent. - if p1 != parents[0]: - p1 = parents[0] - p2 = repo[nullid] - except error.RepoError: - p1, p2 = parents - if p2.node() == nullid: - ui.warn(_("warning: import the patch as a normal revision\n" - "(use --exact to import the patch as a merge)\n")) + p2 = repo[p2] + # Without any options, consider p2 only if the + # patch is being applied on top of the recorded + # first parent. + if p1 != parents[0]: + p1 = parents[0] + p2 = repo[nullid] + except error.RepoError: + p1, p2 = parents + if p2.node() == nullid: + ui.warn(_("warning: import the patch as a normal revision\n" + "(use --exact to import the patch as a merge)\n")) + else: + p1, p2 = parents + + n = None + if update: + if p1 != parents[0]: + updatefunc(repo, p1.node()) + if p2 != parents[1]: + repo.setparents(p1.node(), p2.node()) + + if opts.get('exact') or importbranch: + repo.dirstate.setbranch(branch or 'default') + + partial = opts.get('partial', False) + files = set() + try: + patch.patch(ui, repo, tmpname, strip=strip, prefix=prefix, + files=files, eolmode=None, similarity=sim / 100.0) + except error.PatchError as e: + if not partial: + raise error.Abort(pycompat.bytestr(e)) + if partial: + rejects = True + + files = list(files) + if nocommit: + if message: + msgs.append(message) else: - p1, p2 = parents - - n = None - if update: - if p1 != parents[0]: - updatefunc(repo, p1.node()) - if p2 != parents[1]: - repo.setparents(p1.node(), p2.node()) - - if opts.get('exact') or importbranch: - repo.dirstate.setbranch(branch or 'default') - - partial 
= opts.get('partial', False) + if opts.get('exact') or p2: + # If you got here, you either use --force and know what + # you are doing or used --exact or a merge patch while + # being updated to its first parent. + m = None + else: + m = scmutil.matchfiles(repo, files or []) + editform = mergeeditform(repo[None], 'import.normal') + if opts.get('exact'): + editor = None + else: + editor = getcommiteditor(editform=editform, + **pycompat.strkwargs(opts)) + extra = {} + for idfunc in extrapreimport: + extrapreimportmap[idfunc](repo, patchdata, extra, opts) + overrides = {} + if partial: + overrides[('ui', 'allowemptycommit')] = True + with repo.ui.configoverride(overrides, 'import'): + n = repo.commit(message, user, + date, match=m, + editor=editor, extra=extra) + for idfunc in extrapostimport: + extrapostimportmap[idfunc](repo[n]) + else: + if opts.get('exact') or importbranch: + branch = branch or 'default' + else: + branch = p1.branch() + store = patch.filestore() + try: files = set() try: - patch.patch(ui, repo, tmpname, strip=strip, prefix=prefix, - files=files, eolmode=None, similarity=sim / 100.0) + patch.patchrepo(ui, repo, p1, store, tmpname, strip, prefix, + files, eolmode=None) except error.PatchError as e: - if not partial: - raise error.Abort(str(e)) - if partial: - rejects = True - - files = list(files) - if nocommit: - if message: - msgs.append(message) + raise error.Abort(stringutil.forcebytestr(e)) + if opts.get('exact'): + editor = None else: - if opts.get('exact') or p2: - # If you got here, you either use --force and know what - # you are doing or used --exact or a merge patch while - # being updated to its first parent. 
- m = None - else: - m = scmutil.matchfiles(repo, files or []) - editform = mergeeditform(repo[None], 'import.normal') - if opts.get('exact'): - editor = None - else: - editor = getcommiteditor(editform=editform, - **pycompat.strkwargs(opts)) - extra = {} - for idfunc in extrapreimport: - extrapreimportmap[idfunc](repo, extractdata, extra, opts) - overrides = {} - if partial: - overrides[('ui', 'allowemptycommit')] = True - with repo.ui.configoverride(overrides, 'import'): - n = repo.commit(message, user, - date, match=m, - editor=editor, extra=extra) - for idfunc in extrapostimport: - extrapostimportmap[idfunc](repo[n]) - else: - if opts.get('exact') or importbranch: - branch = branch or 'default' - else: - branch = p1.branch() - store = patch.filestore() - try: - files = set() - try: - patch.patchrepo(ui, repo, p1, store, tmpname, strip, prefix, - files, eolmode=None) - except error.PatchError as e: - raise error.Abort(str(e)) - if opts.get('exact'): - editor = None - else: - editor = getcommiteditor(editform='import.bypass') - memctx = context.memctx(repo, (p1.node(), p2.node()), - message, - files=files, - filectxfn=store, - user=user, - date=date, - branch=branch, - editor=editor) - n = memctx.commit() - finally: - store.close() - if opts.get('exact') and nocommit: - # --exact with --no-commit is still useful in that it does merge - # and branch bits - ui.warn(_("warning: can't check exact import with --no-commit\n")) - elif opts.get('exact') and hex(n) != nodeid: - raise error.Abort(_('patch is damaged or loses information')) - msg = _('applied to working directory') - if n: - # i18n: refers to a short changeset id - msg = _('created %s') % short(n) - return (msg, n, rejects) - finally: - os.unlink(tmpname) + editor = getcommiteditor(editform='import.bypass') + memctx = context.memctx(repo, (p1.node(), p2.node()), + message, + files=files, + filectxfn=store, + user=user, + date=date, + branch=branch, + editor=editor) + n = memctx.commit() + finally: + 
store.close() + if opts.get('exact') and nocommit: + # --exact with --no-commit is still useful in that it does merge + # and branch bits + ui.warn(_("warning: can't check exact import with --no-commit\n")) + elif opts.get('exact') and hex(n) != nodeid: + raise error.Abort(_('patch is damaged or loses information')) + msg = _('applied to working directory') + if n: + # i18n: refers to a short changeset id + msg = _('created %s') % short(n) + return msg, n, rejects # facility to let extensions include additional data in an exported patch # list of identifiers to be executed in order @@ -1492,7 +1553,7 @@ # it is given two arguments (sequencenumber, changectx) extraexportmap = {} -def _exportsingle(repo, ctx, match, switch_parent, rev, seqno, write, diffopts): +def _exportsingle(repo, ctx, fm, match, switch_parent, seqno, diffopts): node = scmutil.binnode(ctx) parents = [p.node() for p in ctx.parents() if p] branch = ctx.branch() @@ -1504,36 +1565,77 @@ else: prev = nullid - write("# HG changeset patch\n") - write("# User %s\n" % ctx.user()) - write("# Date %d %d\n" % ctx.date()) - write("# %s\n" % util.datestr(ctx.date())) - if branch and branch != 'default': - write("# Branch %s\n" % branch) - write("# Node ID %s\n" % hex(node)) - write("# Parent %s\n" % hex(prev)) + fm.context(ctx=ctx) + fm.plain('# HG changeset patch\n') + fm.write('user', '# User %s\n', ctx.user()) + fm.plain('# Date %d %d\n' % ctx.date()) + fm.write('date', '# %s\n', fm.formatdate(ctx.date())) + fm.condwrite(branch and branch != 'default', + 'branch', '# Branch %s\n', branch) + fm.write('node', '# Node ID %s\n', hex(node)) + fm.plain('# Parent %s\n' % hex(prev)) if len(parents) > 1: - write("# Parent %s\n" % hex(parents[1])) - + fm.plain('# Parent %s\n' % hex(parents[1])) + fm.data(parents=fm.formatlist(pycompat.maplist(hex, parents), name='node')) + + # TODO: redesign extraexportmap function to support formatter for headerid in extraexport: header = extraexportmap[headerid](seqno, ctx) if 
header is not None: - write('# %s\n' % header) - write(ctx.description().rstrip()) - write("\n\n") - - for chunk, label in patch.diffui(repo, prev, node, match, opts=diffopts): - write(chunk, label=label) - -def export(repo, revs, fntemplate='hg-%h.patch', fp=None, switch_parent=False, + fm.plain('# %s\n' % header) + + fm.write('desc', '%s\n', ctx.description().rstrip()) + fm.plain('\n') + + if fm.isplain(): + chunkiter = patch.diffui(repo, prev, node, match, opts=diffopts) + for chunk, label in chunkiter: + fm.plain(chunk, label=label) + else: + chunkiter = patch.diff(repo, prev, node, match, opts=diffopts) + # TODO: make it structured? + fm.data(diff=b''.join(chunkiter)) + +def _exportfile(repo, revs, fm, dest, switch_parent, diffopts, match): + """Export changesets to stdout or a single file""" + for seqno, rev in enumerate(revs, 1): + ctx = repo[rev] + if not dest.startswith('<'): + repo.ui.note("%s\n" % dest) + fm.startitem() + _exportsingle(repo, ctx, fm, match, switch_parent, seqno, diffopts) + +def _exportfntemplate(repo, revs, basefm, fntemplate, switch_parent, diffopts, + match): + """Export changesets to possibly multiple files""" + total = len(revs) + revwidth = max(len(str(rev)) for rev in revs) + filemap = util.sortdict() # filename: [(seqno, rev), ...] + + for seqno, rev in enumerate(revs, 1): + ctx = repo[rev] + dest = makefilename(ctx, fntemplate, + total=total, seqno=seqno, revwidth=revwidth) + filemap.setdefault(dest, []).append((seqno, rev)) + + for dest in filemap: + with formatter.maybereopen(basefm, dest) as fm: + repo.ui.note("%s\n" % dest) + for seqno, rev in filemap[dest]: + fm.startitem() + ctx = repo[rev] + _exportsingle(repo, ctx, fm, match, switch_parent, seqno, + diffopts) + +def export(repo, revs, basefm, fntemplate='hg-%h.patch', switch_parent=False, opts=None, match=None): '''export changesets as hg patches Args: repo: The repository from which we're exporting revisions. revs: A list of revisions to export as revision numbers. 
+ basefm: A formatter to which patches should be written. fntemplate: An optional string to use for generating patch file names. - fp: An optional file-like object to which patches should be written. switch_parent: If True, show diffs against second parent when not nullid. Default is false, which always shows diff against p1. opts: diff options to use for generating the patch. @@ -1545,538 +1647,25 @@ Side Effect: "HG Changeset Patch" data is emitted to one of the following destinations: - fp is specified: All revs are written to the specified - file-like object. fntemplate specified: Each rev is written to a unique file named using the given template. - Neither fp nor template specified: All revs written to repo.ui.write() + Otherwise: All revs will be written to basefm. ''' - - total = len(revs) - revwidth = max(len(str(rev)) for rev in revs) - filemode = {} - - write = None - dest = '' - if fp: - dest = getattr(fp, 'name', dest) - def write(s, **kw): - fp.write(s) - elif not fntemplate: - write = repo.ui.write - - for seqno, rev in enumerate(revs, 1): - ctx = repo[rev] - fo = None - if not fp and fntemplate: - desc_lines = ctx.description().rstrip().split('\n') - desc = desc_lines[0] #Commit always has a first line. 
- fo = makefileobj(repo, fntemplate, ctx.node(), desc=desc, - total=total, seqno=seqno, revwidth=revwidth, - mode='wb', modemap=filemode) - dest = fo.name - def write(s, **kw): - fo.write(s) - if not dest.startswith('<'): - repo.ui.note("%s\n" % dest) - _exportsingle( - repo, ctx, match, switch_parent, rev, seqno, write, opts) - if fo is not None: - fo.close() - -def diffordiffstat(ui, repo, diffopts, node1, node2, match, - changes=None, stat=False, fp=None, prefix='', - root='', listsubrepos=False, hunksfilterfn=None): - '''show diff or diffstat.''' - if fp is None: - write = ui.write - else: - def write(s, **kw): - fp.write(s) - - if root: - relroot = pathutil.canonpath(repo.root, repo.getcwd(), root) - else: - relroot = '' - if relroot != '': - # XXX relative roots currently don't work if the root is within a - # subrepo - uirelroot = match.uipath(relroot) - relroot += '/' - for matchroot in match.files(): - if not matchroot.startswith(relroot): - ui.warn(_('warning: %s not inside relative root %s\n') % ( - match.uipath(matchroot), uirelroot)) - - if stat: - diffopts = diffopts.copy(context=0, noprefix=False) - width = 80 - if not ui.plain(): - width = ui.termwidth() - chunks = patch.diff(repo, node1, node2, match, changes, opts=diffopts, - prefix=prefix, relroot=relroot, - hunksfilterfn=hunksfilterfn) - for chunk, label in patch.diffstatui(util.iterlines(chunks), - width=width): - write(chunk, label=label) + scmutil.prefetchfiles(repo, revs, match) + + if not fntemplate: + _exportfile(repo, revs, basefm, '', switch_parent, opts, match) else: - for chunk, label in patch.diffui(repo, node1, node2, match, - changes, opts=diffopts, prefix=prefix, - relroot=relroot, - hunksfilterfn=hunksfilterfn): - write(chunk, label=label) - - if listsubrepos: - ctx1 = repo[node1] - ctx2 = repo[node2] - for subpath, sub in scmutil.itersubrepos(ctx1, ctx2): - tempnode2 = node2 - try: - if node2 is not None: - tempnode2 = ctx2.substate[subpath][1] - except KeyError: - # A subrepo 
that existed in node1 was deleted between node1 and - # node2 (inclusive). Thus, ctx2's substate won't contain that - # subpath. The best we can do is to ignore it. - tempnode2 = None - submatch = matchmod.subdirmatcher(subpath, match) - sub.diff(ui, diffopts, tempnode2, submatch, changes=changes, - stat=stat, fp=fp, prefix=prefix) - -def _changesetlabels(ctx): - labels = ['log.changeset', 'changeset.%s' % ctx.phasestr()] - if ctx.obsolete(): - labels.append('changeset.obsolete') - if ctx.isunstable(): - labels.append('changeset.unstable') - for instability in ctx.instabilities(): - labels.append('instability.%s' % instability) - return ' '.join(labels) - -class changeset_printer(object): - '''show changeset information when templating not requested.''' - - def __init__(self, ui, repo, matchfn, diffopts, buffered): - self.ui = ui - self.repo = repo - self.buffered = buffered - self.matchfn = matchfn - self.diffopts = diffopts - self.header = {} - self.hunk = {} - self.lastheader = None - self.footer = None - self._columns = templatekw.getlogcolumns() - - def flush(self, ctx): - rev = ctx.rev() - if rev in self.header: - h = self.header[rev] - if h != self.lastheader: - self.lastheader = h - self.ui.write(h) - del self.header[rev] - if rev in self.hunk: - self.ui.write(self.hunk[rev]) - del self.hunk[rev] - - def close(self): - if self.footer: - self.ui.write(self.footer) - - def show(self, ctx, copies=None, matchfn=None, hunksfilterfn=None, - **props): - props = pycompat.byteskwargs(props) - if self.buffered: - self.ui.pushbuffer(labeled=True) - self._show(ctx, copies, matchfn, hunksfilterfn, props) - self.hunk[ctx.rev()] = self.ui.popbuffer() - else: - self._show(ctx, copies, matchfn, hunksfilterfn, props) - - def _show(self, ctx, copies, matchfn, hunksfilterfn, props): - '''show a single changeset or file revision''' - changenode = ctx.node() - rev = ctx.rev() - - if self.ui.quiet: - self.ui.write("%s\n" % scmutil.formatchangeid(ctx), - label='log.node') - return 
- - columns = self._columns - self.ui.write(columns['changeset'] % scmutil.formatchangeid(ctx), - label=_changesetlabels(ctx)) - - # branches are shown first before any other names due to backwards - # compatibility - branch = ctx.branch() - # don't show the default branch name - if branch != 'default': - self.ui.write(columns['branch'] % branch, label='log.branch') - - for nsname, ns in self.repo.names.iteritems(): - # branches has special logic already handled above, so here we just - # skip it - if nsname == 'branches': - continue - # we will use the templatename as the color name since those two - # should be the same - for name in ns.names(self.repo, changenode): - self.ui.write(ns.logfmt % name, - label='log.%s' % ns.colorname) - if self.ui.debugflag: - self.ui.write(columns['phase'] % ctx.phasestr(), label='log.phase') - for pctx in scmutil.meaningfulparents(self.repo, ctx): - label = 'log.parent changeset.%s' % pctx.phasestr() - self.ui.write(columns['parent'] % scmutil.formatchangeid(pctx), - label=label) - - if self.ui.debugflag and rev is not None: - mnode = ctx.manifestnode() - mrev = self.repo.manifestlog._revlog.rev(mnode) - self.ui.write(columns['manifest'] - % scmutil.formatrevnode(self.ui, mrev, mnode), - label='ui.debug log.manifest') - self.ui.write(columns['user'] % ctx.user(), label='log.user') - self.ui.write(columns['date'] % util.datestr(ctx.date()), - label='log.date') - - if ctx.isunstable(): - instabilities = ctx.instabilities() - self.ui.write(columns['instability'] % ', '.join(instabilities), - label='log.instability') - - elif ctx.obsolete(): - self._showobsfate(ctx) - - self._exthook(ctx) - - if self.ui.debugflag: - files = ctx.p1().status(ctx)[:3] - for key, value in zip(['files', 'files+', 'files-'], files): - if value: - self.ui.write(columns[key] % " ".join(value), - label='ui.debug log.files') - elif ctx.files() and self.ui.verbose: - self.ui.write(columns['files'] % " ".join(ctx.files()), - label='ui.note log.files') - if copies 
and self.ui.verbose: - copies = ['%s (%s)' % c for c in copies] - self.ui.write(columns['copies'] % ' '.join(copies), - label='ui.note log.copies') - - extra = ctx.extra() - if extra and self.ui.debugflag: - for key, value in sorted(extra.items()): - self.ui.write(columns['extra'] % (key, util.escapestr(value)), - label='ui.debug log.extra') - - description = ctx.description().strip() - if description: - if self.ui.verbose: - self.ui.write(_("description:\n"), - label='ui.note log.description') - self.ui.write(description, - label='ui.note log.description') - self.ui.write("\n\n") - else: - self.ui.write(columns['summary'] % description.splitlines()[0], - label='log.summary') - self.ui.write("\n") - - self.showpatch(ctx, matchfn, hunksfilterfn=hunksfilterfn) - - def _showobsfate(self, ctx): - obsfate = templatekw.showobsfate(repo=self.repo, ctx=ctx, ui=self.ui) - - if obsfate: - for obsfateline in obsfate: - self.ui.write(self._columns['obsolete'] % obsfateline, - label='log.obsfate') - - def _exthook(self, ctx): - '''empty method used by extension as a hook point - ''' - - def showpatch(self, ctx, matchfn, hunksfilterfn=None): - if not matchfn: - matchfn = self.matchfn - if matchfn: - stat = self.diffopts.get('stat') - diff = self.diffopts.get('patch') - diffopts = patch.diffallopts(self.ui, self.diffopts) - node = ctx.node() - prev = ctx.p1().node() - if stat: - diffordiffstat(self.ui, self.repo, diffopts, prev, node, - match=matchfn, stat=True, - hunksfilterfn=hunksfilterfn) - if diff: - if stat: - self.ui.write("\n") - diffordiffstat(self.ui, self.repo, diffopts, prev, node, - match=matchfn, stat=False, - hunksfilterfn=hunksfilterfn) - if stat or diff: - self.ui.write("\n") - -class jsonchangeset(changeset_printer): - '''format changeset information.''' - - def __init__(self, ui, repo, matchfn, diffopts, buffered): - changeset_printer.__init__(self, ui, repo, matchfn, diffopts, buffered) - self.cache = {} - self._first = True - - def close(self): - if not 
self._first: - self.ui.write("\n]\n") - else: - self.ui.write("[]\n") - - def _show(self, ctx, copies, matchfn, hunksfilterfn, props): - '''show a single changeset or file revision''' - rev = ctx.rev() - if rev is None: - jrev = jnode = 'null' - else: - jrev = '%d' % rev - jnode = '"%s"' % hex(ctx.node()) - j = encoding.jsonescape - - if self._first: - self.ui.write("[\n {") - self._first = False - else: - self.ui.write(",\n {") - - if self.ui.quiet: - self.ui.write(('\n "rev": %s') % jrev) - self.ui.write((',\n "node": %s') % jnode) - self.ui.write('\n }') - return - - self.ui.write(('\n "rev": %s') % jrev) - self.ui.write((',\n "node": %s') % jnode) - self.ui.write((',\n "branch": "%s"') % j(ctx.branch())) - self.ui.write((',\n "phase": "%s"') % ctx.phasestr()) - self.ui.write((',\n "user": "%s"') % j(ctx.user())) - self.ui.write((',\n "date": [%d, %d]') % ctx.date()) - self.ui.write((',\n "desc": "%s"') % j(ctx.description())) - - self.ui.write((',\n "bookmarks": [%s]') % - ", ".join('"%s"' % j(b) for b in ctx.bookmarks())) - self.ui.write((',\n "tags": [%s]') % - ", ".join('"%s"' % j(t) for t in ctx.tags())) - self.ui.write((',\n "parents": [%s]') % - ", ".join('"%s"' % c.hex() for c in ctx.parents())) - - if self.ui.debugflag: - if rev is None: - jmanifestnode = 'null' - else: - jmanifestnode = '"%s"' % hex(ctx.manifestnode()) - self.ui.write((',\n "manifest": %s') % jmanifestnode) - - self.ui.write((',\n "extra": {%s}') % - ", ".join('"%s": "%s"' % (j(k), j(v)) - for k, v in ctx.extra().items())) - - files = ctx.p1().status(ctx) - self.ui.write((',\n "modified": [%s]') % - ", ".join('"%s"' % j(f) for f in files[0])) - self.ui.write((',\n "added": [%s]') % - ", ".join('"%s"' % j(f) for f in files[1])) - self.ui.write((',\n "removed": [%s]') % - ", ".join('"%s"' % j(f) for f in files[2])) - - elif self.ui.verbose: - self.ui.write((',\n "files": [%s]') % - ", ".join('"%s"' % j(f) for f in ctx.files())) - - if copies: - self.ui.write((',\n "copies": {%s}') % - ", 
".join('"%s": "%s"' % (j(k), j(v)) - for k, v in copies)) - - matchfn = self.matchfn - if matchfn: - stat = self.diffopts.get('stat') - diff = self.diffopts.get('patch') - diffopts = patch.difffeatureopts(self.ui, self.diffopts, git=True) - node, prev = ctx.node(), ctx.p1().node() - if stat: - self.ui.pushbuffer() - diffordiffstat(self.ui, self.repo, diffopts, prev, node, - match=matchfn, stat=True) - self.ui.write((',\n "diffstat": "%s"') - % j(self.ui.popbuffer())) - if diff: - self.ui.pushbuffer() - diffordiffstat(self.ui, self.repo, diffopts, prev, node, - match=matchfn, stat=False) - self.ui.write((',\n "diff": "%s"') % j(self.ui.popbuffer())) - - self.ui.write("\n }") - -class changeset_templater(changeset_printer): - '''format changeset information. - - Note: there are a variety of convenience functions to build a - changeset_templater for common cases. See functions such as: - makelogtemplater, show_changeset, buildcommittemplate, or other - functions that use changesest_templater. - ''' - - # Arguments before "buffered" used to be positional. Consider not - # adding/removing arguments before "buffered" to not break callers. 
- def __init__(self, ui, repo, tmplspec, matchfn=None, diffopts=None, - buffered=False): - diffopts = diffopts or {} - - changeset_printer.__init__(self, ui, repo, matchfn, diffopts, buffered) - tres = formatter.templateresources(ui, repo) - self.t = formatter.loadtemplater(ui, tmplspec, - defaults=templatekw.keywords, - resources=tres, - cache=templatekw.defaulttempl) - self._counter = itertools.count() - self.cache = tres['cache'] # shared with _graphnodeformatter() - - self._tref = tmplspec.ref - self._parts = {'header': '', 'footer': '', - tmplspec.ref: tmplspec.ref, - 'docheader': '', 'docfooter': '', - 'separator': ''} - if tmplspec.mapfile: - # find correct templates for current mode, for backward - # compatibility with 'log -v/-q/--debug' using a mapfile - tmplmodes = [ - (True, ''), - (self.ui.verbose, '_verbose'), - (self.ui.quiet, '_quiet'), - (self.ui.debugflag, '_debug'), - ] - for mode, postfix in tmplmodes: - for t in self._parts: - cur = t + postfix - if mode and cur in self.t: - self._parts[t] = cur - else: - partnames = [p for p in self._parts.keys() if p != tmplspec.ref] - m = formatter.templatepartsmap(tmplspec, self.t, partnames) - self._parts.update(m) - - if self._parts['docheader']: - self.ui.write(templater.stringify(self.t(self._parts['docheader']))) - - def close(self): - if self._parts['docfooter']: - if not self.footer: - self.footer = "" - self.footer += templater.stringify(self.t(self._parts['docfooter'])) - return super(changeset_templater, self).close() - - def _show(self, ctx, copies, matchfn, hunksfilterfn, props): - '''show a single changeset or file revision''' - props = props.copy() - props['ctx'] = ctx - props['index'] = index = next(self._counter) - props['revcache'] = {'copies': copies} - props = pycompat.strkwargs(props) - - # write separator, which wouldn't work well with the header part below - # since there's inherently a conflict between header (across items) and - # separator (per item) - if self._parts['separator'] 
and index > 0: - self.ui.write(templater.stringify(self.t(self._parts['separator']))) - - # write header - if self._parts['header']: - h = templater.stringify(self.t(self._parts['header'], **props)) - if self.buffered: - self.header[ctx.rev()] = h - else: - if self.lastheader != h: - self.lastheader = h - self.ui.write(h) - - # write changeset metadata, then patch if requested - key = self._parts[self._tref] - self.ui.write(templater.stringify(self.t(key, **props))) - self.showpatch(ctx, matchfn, hunksfilterfn=hunksfilterfn) - - if self._parts['footer']: - if not self.footer: - self.footer = templater.stringify( - self.t(self._parts['footer'], **props)) - -def logtemplatespec(tmpl, mapfile): - if mapfile: - return formatter.templatespec('changeset', tmpl, mapfile) - else: - return formatter.templatespec('', tmpl, None) - -def _lookuplogtemplate(ui, tmpl, style): - """Find the template matching the given template spec or style - - See formatter.lookuptemplate() for details. - """ - - # ui settings - if not tmpl and not style: # template are stronger than style - tmpl = ui.config('ui', 'logtemplate') - if tmpl: - return logtemplatespec(templater.unquotestring(tmpl), None) - else: - style = util.expandpath(ui.config('ui', 'style')) - - if not tmpl and style: - mapfile = style - if not os.path.split(mapfile)[0]: - mapname = (templater.templatepath('map-cmdline.' + mapfile) - or templater.templatepath(mapfile)) - if mapname: - mapfile = mapname - return logtemplatespec(None, mapfile) - - if not tmpl: - return logtemplatespec(None, None) - - return formatter.lookuptemplate(ui, 'changeset', tmpl) - -def makelogtemplater(ui, repo, tmpl, buffered=False): - """Create a changeset_templater from a literal template 'tmpl' - byte-string.""" - spec = logtemplatespec(tmpl, None) - return changeset_templater(ui, repo, spec, buffered=buffered) - -def show_changeset(ui, repo, opts, buffered=False): - """show one changeset using template or regular display. 
- - Display format will be the first non-empty hit of: - 1. option 'template' - 2. option 'style' - 3. [ui] setting 'logtemplate' - 4. [ui] setting 'style' - If all of these values are either the unset or the empty string, - regular display via changeset_printer() is done. - """ - # options - match = None - if opts.get('patch') or opts.get('stat'): - match = scmutil.matchall(repo) - - if opts.get('template') == 'json': - return jsonchangeset(ui, repo, match, opts, buffered) - - spec = _lookuplogtemplate(ui, opts.get('template'), opts.get('style')) - - if not spec.ref and not spec.tmpl and not spec.mapfile: - return changeset_printer(ui, repo, match, opts, buffered) - - return changeset_templater(ui, repo, spec, match, opts, buffered) + _exportfntemplate(repo, revs, basefm, fntemplate, switch_parent, opts, + match) + +def exportfile(repo, revs, fp, switch_parent=False, opts=None, match=None): + """Export changesets to the given file stream""" + scmutil.prefetchfiles(repo, revs, match) + + dest = getattr(fp, 'name', '') + with formatter.formatter(repo.ui, fp, 'export', {}) as fm: + _exportfile(repo, revs, fm, dest, switch_parent, opts, match) def showmarker(fm, marker, index=None): """utility function to display obsolescence marker in a readable way @@ -2096,13 +1685,14 @@ fm.write('date', '(%s) ', fm.formatdate(marker.date())) meta = marker.metadata().copy() meta.pop('date', None) - fm.write('metadata', '{%s}', fm.formatdict(meta, fmt='%r: %r', sep=', ')) + smeta = util.rapply(pycompat.maybebytestr, meta) + fm.write('metadata', '{%s}', fm.formatdict(smeta, fmt='%r: %r', sep=', ')) fm.plain('\n') def finddate(ui, repo, date): """Find the tipmost changeset that matches the given date spec""" - df = util.matchdate(date) + df = dateutil.matchdate(date) m = scmutil.matchall(repo) results = {} @@ -2115,7 +1705,7 @@ rev = ctx.rev() if rev in results: ui.status(_("found revision %s from %s\n") % - (rev, util.datestr(results[rev]))) + (rev, dateutil.datestr(results[rev]))) 
return '%d' % rev raise error.Abort(_("revision matching date not found")) @@ -2301,7 +1891,7 @@ wanted = set() slowpath = match.anypats() or (not match.always() and opts.get('removed')) fncache = {} - change = repo.changectx + change = repo.__getitem__ # First step is to fill wanted, the set of revisions that we want to yield. # When it does not induce extra cost, we also fill fncache for revisions in @@ -2353,7 +1943,7 @@ else: self.revs.discard(value) ctx = change(value) - matches = filter(match, ctx.files()) + matches = [f for f in ctx.files() if match(f)] if matches: fncache[value] = matches self.set.add(value) @@ -2416,394 +2006,6 @@ return iterate() -def _makelogmatcher(repo, revs, pats, opts): - """Build matcher and expanded patterns from log options - - If --follow, revs are the revisions to follow from. - - Returns (match, pats, slowpath) where - - match: a matcher built from the given pats and -I/-X opts - - pats: patterns used (globs are expanded on Windows) - - slowpath: True if patterns aren't as simple as scanning filelogs - """ - # pats/include/exclude are passed to match.match() directly in - # _matchfiles() revset but walkchangerevs() builds its matcher with - # scmutil.match(). The difference is input pats are globbed on - # platforms without shell expansion (windows). - wctx = repo[None] - match, pats = scmutil.matchandpats(wctx, pats, opts) - slowpath = match.anypats() or (not match.always() and opts.get('removed')) - if not slowpath: - follow = opts.get('follow') or opts.get('follow_first') - startctxs = [] - if follow and opts.get('rev'): - startctxs = [repo[r] for r in revs] - for f in match.files(): - if follow and startctxs: - # No idea if the path was a directory at that revision, so - # take the slow path. - if any(f not in c for c in startctxs): - slowpath = True - continue - elif follow and f not in wctx: - # If the file exists, it may be a directory, so let it - # take the slow path. 
- if os.path.exists(repo.wjoin(f)): - slowpath = True - continue - else: - raise error.Abort(_('cannot follow file not in parent ' - 'revision: "%s"') % f) - filelog = repo.file(f) - if not filelog: - # A zero count may be a directory or deleted file, so - # try to find matching entries on the slow path. - if follow: - raise error.Abort( - _('cannot follow nonexistent file: "%s"') % f) - slowpath = True - - # We decided to fall back to the slowpath because at least one - # of the paths was not a file. Check to see if at least one of them - # existed in history - in that case, we'll continue down the - # slowpath; otherwise, we can turn off the slowpath - if slowpath: - for path in match.files(): - if path == '.' or path in repo.store: - break - else: - slowpath = False - - return match, pats, slowpath - -def _fileancestors(repo, revs, match, followfirst): - fctxs = [] - for r in revs: - ctx = repo[r] - fctxs.extend(ctx[f].introfilectx() for f in ctx.walk(match)) - - # When displaying a revision with --patch --follow FILE, we have - # to know which file of the revision must be diffed. With - # --follow, we want the names of the ancestors of FILE in the - # revision, stored in "fcache". "fcache" is populated as a side effect - # of the graph traversal. 
- fcache = {} - def filematcher(rev): - return scmutil.matchfiles(repo, fcache.get(rev, [])) - - def revgen(): - for rev, cs in dagop.filectxancestors(fctxs, followfirst=followfirst): - fcache[rev] = [c.path() for c in cs] - yield rev - return smartset.generatorset(revgen(), iterasc=False), filematcher - -def _makenofollowlogfilematcher(repo, pats, opts): - '''hook for extensions to override the filematcher for non-follow cases''' - return None - -_opt2logrevset = { - 'no_merges': ('not merge()', None), - 'only_merges': ('merge()', None), - '_matchfiles': (None, '_matchfiles(%ps)'), - 'date': ('date(%s)', None), - 'branch': ('branch(%s)', '%lr'), - '_patslog': ('filelog(%s)', '%lr'), - 'keyword': ('keyword(%s)', '%lr'), - 'prune': ('ancestors(%s)', 'not %lr'), - 'user': ('user(%s)', '%lr'), -} - -def _makelogrevset(repo, match, pats, slowpath, opts): - """Return a revset string built from log options and file patterns""" - opts = dict(opts) - # follow or not follow? - follow = opts.get('follow') or opts.get('follow_first') - - # branch and only_branch are really aliases and must be handled at - # the same time - opts['branch'] = opts.get('branch', []) + opts.get('only_branch', []) - opts['branch'] = [repo.lookupbranch(b) for b in opts['branch']] - - if slowpath: - # See walkchangerevs() slow path. - # - # pats/include/exclude cannot be represented as separate - # revset expressions as their filtering logic applies at file - # level. For instance "-I a -X b" matches a revision touching - # "a" and "b" while "file(a) and not file(b)" does - # not. Besides, filesets are evaluated against the working - # directory. 
- matchargs = ['r:', 'd:relpath'] - for p in pats: - matchargs.append('p:' + p) - for p in opts.get('include', []): - matchargs.append('i:' + p) - for p in opts.get('exclude', []): - matchargs.append('x:' + p) - opts['_matchfiles'] = matchargs - elif not follow: - opts['_patslog'] = list(pats) - - expr = [] - for op, val in sorted(opts.iteritems()): - if not val: - continue - if op not in _opt2logrevset: - continue - revop, listop = _opt2logrevset[op] - if revop and '%' not in revop: - expr.append(revop) - elif not listop: - expr.append(revsetlang.formatspec(revop, val)) - else: - if revop: - val = [revsetlang.formatspec(revop, v) for v in val] - expr.append(revsetlang.formatspec(listop, val)) - - if expr: - expr = '(' + ' and '.join(expr) + ')' - else: - expr = None - return expr - -def _logrevs(repo, opts): - """Return the initial set of revisions to be filtered or followed""" - follow = opts.get('follow') or opts.get('follow_first') - if opts.get('rev'): - revs = scmutil.revrange(repo, opts['rev']) - elif follow and repo.dirstate.p1() == nullid: - revs = smartset.baseset() - elif follow: - revs = repo.revs('.') - else: - revs = smartset.spanset(repo) - revs.reverse() - return revs - -def getlogrevs(repo, pats, opts): - """Return (revs, filematcher) where revs is a smartset - - filematcher is a callable taking a revision number and returning a match - objects filtering the files to be detailed when displaying the revision. 
- """ - follow = opts.get('follow') or opts.get('follow_first') - followfirst = opts.get('follow_first') - limit = loglimit(opts) - revs = _logrevs(repo, opts) - if not revs: - return smartset.baseset(), None - match, pats, slowpath = _makelogmatcher(repo, revs, pats, opts) - filematcher = None - if follow: - if slowpath or match.always(): - revs = dagop.revancestors(repo, revs, followfirst=followfirst) - else: - revs, filematcher = _fileancestors(repo, revs, match, followfirst) - revs.reverse() - if filematcher is None: - filematcher = _makenofollowlogfilematcher(repo, pats, opts) - if filematcher is None: - def filematcher(rev): - return match - - expr = _makelogrevset(repo, match, pats, slowpath, opts) - if opts.get('graph') and opts.get('rev'): - # User-specified revs might be unsorted, but don't sort before - # _makelogrevset because it might depend on the order of revs - if not (revs.isdescending() or revs.istopo()): - revs.sort(reverse=True) - if expr: - matcher = revset.match(None, expr) - revs = matcher(repo, revs) - if limit is not None: - revs = revs.slice(0, limit) - return revs, filematcher - -def _parselinerangelogopt(repo, opts): - """Parse --line-range log option and return a list of tuples (filename, - (fromline, toline)). - """ - linerangebyfname = [] - for pat in opts.get('line_range', []): - try: - pat, linerange = pat.rsplit(',', 1) - except ValueError: - raise error.Abort(_('malformatted line-range pattern %s') % pat) - try: - fromline, toline = map(int, linerange.split(':')) - except ValueError: - raise error.Abort(_("invalid line range for %s") % pat) - msg = _("line range pattern '%s' must match exactly one file") % pat - fname = scmutil.parsefollowlinespattern(repo, None, pat, msg) - linerangebyfname.append( - (fname, util.processlinerange(fromline, toline))) - return linerangebyfname - -def getloglinerangerevs(repo, userrevs, opts): - """Return (revs, filematcher, hunksfilter). 
- - "revs" are revisions obtained by processing "line-range" log options and - walking block ancestors of each specified file/line-range. - - "filematcher(rev) -> match" is a factory function returning a match object - for a given revision for file patterns specified in --line-range option. - If neither --stat nor --patch options are passed, "filematcher" is None. - - "hunksfilter(rev) -> filterfn(fctx, hunks)" is a factory function - returning a hunks filtering function. - If neither --stat nor --patch options are passed, "filterhunks" is None. - """ - wctx = repo[None] - - # Two-levels map of "rev -> file ctx -> [line range]". - linerangesbyrev = {} - for fname, (fromline, toline) in _parselinerangelogopt(repo, opts): - if fname not in wctx: - raise error.Abort(_('cannot follow file not in parent ' - 'revision: "%s"') % fname) - fctx = wctx.filectx(fname) - for fctx, linerange in dagop.blockancestors(fctx, fromline, toline): - rev = fctx.introrev() - if rev not in userrevs: - continue - linerangesbyrev.setdefault( - rev, {}).setdefault( - fctx.path(), []).append(linerange) - - filematcher = None - hunksfilter = None - if opts.get('patch') or opts.get('stat'): - - def nofilterhunksfn(fctx, hunks): - return hunks - - def hunksfilter(rev): - fctxlineranges = linerangesbyrev.get(rev) - if fctxlineranges is None: - return nofilterhunksfn - - def filterfn(fctx, hunks): - lineranges = fctxlineranges.get(fctx.path()) - if lineranges is not None: - for hr, lines in hunks: - if hr is None: # binary - yield hr, lines - continue - if any(mdiff.hunkinrange(hr[2:], lr) - for lr in lineranges): - yield hr, lines - else: - for hunk in hunks: - yield hunk - - return filterfn - - def filematcher(rev): - files = list(linerangesbyrev.get(rev, [])) - return scmutil.matchfiles(repo, files) - - revs = sorted(linerangesbyrev, reverse=True) - - return revs, filematcher, hunksfilter - -def _graphnodeformatter(ui, displayer): - spec = ui.config('ui', 'graphnodetemplate') - if not spec: - 
return templatekw.showgraphnode # fast path for "{graphnode}" - - spec = templater.unquotestring(spec) - tres = formatter.templateresources(ui) - if isinstance(displayer, changeset_templater): - tres['cache'] = displayer.cache # reuse cache of slow templates - templ = formatter.maketemplater(ui, spec, defaults=templatekw.keywords, - resources=tres) - def formatnode(repo, ctx): - props = {'ctx': ctx, 'repo': repo, 'revcache': {}} - return templ.render(props) - return formatnode - -def displaygraph(ui, repo, dag, displayer, edgefn, getrenamed=None, - filematcher=None, props=None): - props = props or {} - formatnode = _graphnodeformatter(ui, displayer) - state = graphmod.asciistate() - styles = state['styles'] - - # only set graph styling if HGPLAIN is not set. - if ui.plain('graph'): - # set all edge styles to |, the default pre-3.8 behaviour - styles.update(dict.fromkeys(styles, '|')) - else: - edgetypes = { - 'parent': graphmod.PARENT, - 'grandparent': graphmod.GRANDPARENT, - 'missing': graphmod.MISSINGPARENT - } - for name, key in edgetypes.items(): - # experimental config: experimental.graphstyle.* - styles[key] = ui.config('experimental', 'graphstyle.%s' % name, - styles[key]) - if not styles[key]: - styles[key] = None - - # experimental config: experimental.graphshorten - state['graphshorten'] = ui.configbool('experimental', 'graphshorten') - - for rev, type, ctx, parents in dag: - char = formatnode(repo, ctx) - copies = None - if getrenamed and ctx.rev(): - copies = [] - for fn in ctx.files(): - rename = getrenamed(fn, ctx.rev()) - if rename: - copies.append((fn, rename[0])) - revmatchfn = None - if filematcher is not None: - revmatchfn = filematcher(ctx.rev()) - edges = edgefn(type, char, state, rev, parents) - firstedge = next(edges) - width = firstedge[2] - displayer.show(ctx, copies=copies, matchfn=revmatchfn, - _graphwidth=width, **pycompat.strkwargs(props)) - lines = displayer.hunk.pop(rev).split('\n') - if not lines[-1]: - del lines[-1] - 
displayer.flush(ctx) - for type, char, width, coldata in itertools.chain([firstedge], edges): - graphmod.ascii(ui, state, type, char, lines, coldata) - lines = [] - displayer.close() - -def graphlog(ui, repo, revs, filematcher, opts): - # Parameters are identical to log command ones - revdag = graphmod.dagwalker(repo, revs) - - getrenamed = None - if opts.get('copies'): - endrev = None - if opts.get('rev'): - endrev = scmutil.revrange(repo, opts.get('rev')).max() + 1 - getrenamed = templatekw.getrenamedfn(repo, endrev=endrev) - - ui.pager('log') - displayer = show_changeset(ui, repo, opts, buffered=True) - displaygraph(ui, repo, revdag, displayer, graphmod.asciiedges, getrenamed, - filematcher) - -def checkunsupportedgraphflags(pats, opts): - for op in ["newest_first"]: - if op in opts and opts[op]: - raise error.Abort(_("-G/--graph option is incompatible with --%s") - % op.replace("_", "-")) - -def graphrevs(repo, nodes, opts): - limit = loglimit(opts) - nodes.reverse() - if limit is not None: - nodes = nodes[:limit] - return graphmod.nodes(repo, nodes) - def add(ui, repo, match, prefix, explicitonly, **opts): join = lambda f: os.path.join(prefix, f) bad = [] @@ -2856,7 +2058,9 @@ for subpath in ctx.substate: ctx.sub(subpath).addwebdirpath(serverpath, webconf) -def forget(ui, repo, match, prefix, explicitonly): +def forget(ui, repo, match, prefix, explicitonly, dryrun, interactive): + if dryrun and interactive: + raise error.Abort(_("cannot specify both --dry-run and --interactive")) join = lambda f: os.path.join(prefix, f) bad = [] badfn = lambda x, y: bad.append(x) or match.bad(x, y) @@ -2872,7 +2076,8 @@ sub = wctx.sub(subpath) try: submatch = matchmod.subdirmatcher(subpath, match) - subbad, subforgot = sub.forget(submatch, prefix) + subbad, subforgot = sub.forget(submatch, prefix, dryrun=dryrun, + interactive=interactive) bad.extend([subpath + '/' + f for f in subbad]) forgot.extend([subpath + '/' + f for f in subforgot]) except error.LookupError: @@ -2895,13 
+2100,40 @@ % match.rel(f)) bad.append(f) + if interactive: + responses = _('[Ynsa?]' + '$$ &Yes, forget this file' + '$$ &No, skip this file' + '$$ &Skip remaining files' + '$$ Include &all remaining files' + '$$ &? (display help)') + for filename in forget[:]: + r = ui.promptchoice(_('forget %s %s') % (filename, responses)) + if r == 4: # ? + while r == 4: + for c, t in ui.extractchoices(responses)[1]: + ui.write('%s - %s\n' % (c, encoding.lower(t))) + r = ui.promptchoice(_('forget %s %s') % (filename, + responses)) + if r == 0: # yes + continue + elif r == 1: # no + forget.remove(filename) + elif r == 2: # Skip + fnindex = forget.index(filename) + del forget[fnindex:] + break + elif r == 3: # All + break + for f in forget: - if ui.verbose or not match.exact(f): + if ui.verbose or not match.exact(f) or interactive: ui.status(_('removing %s\n') % match.rel(f)) - rejected = wctx.forget(forget, prefix) - bad.extend(f for f in rejected if f in match.files()) - forgot.extend(f for f in forget if f not in rejected) + if not dryrun: + rejected = wctx.forget(forget, prefix) + bad.extend(f for f in rejected if f in match.files()) + forgot.extend(f for f in forget if f not in rejected) return bad, forgot def files(ui, ctx, m, fm, fmt, subrepos): @@ -2934,7 +2166,7 @@ return ret -def remove(ui, repo, m, prefix, after, force, subrepos, warnings=None): +def remove(ui, repo, m, prefix, after, force, subrepos, dryrun, warnings=None): join = lambda f: os.path.join(prefix, f) ret = 0 s = repo.status(match=m, clean=True) @@ -2959,7 +2191,7 @@ sub = wctx.sub(subpath) try: if sub.removefiles(submatch, prefix, after, force, subrepos, - warnings): + dryrun, warnings): ret = 1 except error.LookupError: warnings.append(_("skipping missing subrepository: %s\n") @@ -3039,13 +2271,14 @@ ui.status(_('removing %s\n') % m.rel(f)) ui.progress(_('deleting'), None) - with repo.wlock(): - if not after: - for f in list: - if f in added: - continue # we never unlink added files on remove - 
repo.wvfs.unlinkpath(f, ignoremissing=True) - repo[None].forget(list) + if not dryrun: + with repo.wlock(): + if not after: + for f in list: + if f in added: + continue # we never unlink added files on remove + repo.wvfs.unlinkpath(f, ignoremissing=True) + repo[None].forget(list) if warn: for warning in warnings: @@ -3072,14 +2305,14 @@ def write(path): filename = None if fntemplate: - filename = makefilename(repo, fntemplate, ctx.node(), + filename = makefilename(ctx, fntemplate, pathname=os.path.join(prefix, path)) # attempt to create the directory if it does not already exist try: os.makedirs(os.path.dirname(filename)) except OSError: pass - with formatter.maybereopen(basefm, filename, opts) as fm: + with formatter.maybereopen(basefm, filename) as fm: _updatecatformatter(fm, ctx, matcher, path, opts.get('decode')) # Automation often uses hg cat on single files, so special case it @@ -3090,11 +2323,14 @@ mfnode = ctx.manifestnode() try: if mfnode and mfl[mfnode].find(file)[0]: + scmutil.prefetchfiles(repo, [ctx.rev()], matcher) write(file) return 0 except KeyError: pass + scmutil.prefetchfiles(repo, [ctx.rev()], matcher) + for abs in ctx.walk(matcher): write(abs) err = 0 @@ -3118,7 +2354,7 @@ '''commit the specified files or all outstanding changes''' date = opts.get('date') if date: - opts['date'] = util.parsedate(date) + opts['date'] = dateutil.parsedate(date) message = logmessage(ui, opts) matcher = scmutil.match(repo[None], pats, opts) @@ -3183,7 +2419,7 @@ date = opts.get('date') or old.date() # Parse the date to allow comparison between date and old.date() - date = util.parsedate(date) + date = dateutil.parsedate(date) if len(old.parents()) > 1: # ctx.files() isn't reliable for merges, so fall back to the @@ -3205,16 +2441,13 @@ # subrepo.precommit(). To minimize the risk of this hack, we do # nothing if .hgsub does not exist. if '.hgsub' in wctx or '.hgsub' in old: - from . 
import subrepo # avoid cycle: cmdutil -> subrepo -> cmdutil - subs, commitsubs, newsubstate = subrepo.precommit( + subs, commitsubs, newsubstate = subrepoutil.precommit( ui, wctx, wctx._status, matcher) # amend should abort if commitsubrepos is enabled assert not commitsubs if subs: - subrepo.writestate(repo, newsubstate) - - # avoid cycle (TODO: should be removed in default branch) - from . import merge as mergemod + subrepoutil.writestate(repo, newsubstate) + ms = mergemod.mergestate.read(repo) mergeutil.checkunresolved(ms) @@ -3404,7 +2637,7 @@ def buildcommittemplate(repo, ctx, subs, extramsg, ref): ui = repo.ui spec = formatter.templatespec(ref, None, None) - t = changeset_templater(ui, repo, spec, None, {}, False) + t = logcmdutil.changesettemplater(ui, repo, spec) t.t.cache.update((k, templater.unquotestring(v)) for k, v in repo.ui.configitems('committemplate')) @@ -3487,12 +2720,12 @@ if not opts.get('close_branch'): for r in parents: if r.closesbranch() and r.branch() == branch: - repo.ui.status(_('reopening closed branch head %d\n') % r) + repo.ui.status(_('reopening closed branch head %d\n') % r.rev()) if repo.ui.debugflag: - repo.ui.write(_('committed changeset %d:%s\n') % (int(ctx), ctx.hex())) + repo.ui.write(_('committed changeset %d:%s\n') % (ctx.rev(), ctx.hex())) elif repo.ui.verbose: - repo.ui.write(_('committed changeset %d:%s\n') % (int(ctx), ctx)) + repo.ui.write(_('committed changeset %d:%s\n') % (ctx.rev(), ctx)) def postcommitstatus(repo, pats, opts): return repo.status(match=scmutil.match(repo[None], pats, opts)) @@ -3769,7 +3002,18 @@ if not opts.get('dry_run'): needdata = ('revert', 'add', 'undelete') - _revertprefetch(repo, ctx, *[actions[name][0] for name in needdata]) + if _revertprefetch is not _revertprefetchstub: + ui.deprecwarn("'cmdutil._revertprefetch' is deprecated, " + "add a callback to 'scmutil.fileprefetchhooks'", + '4.6', stacklevel=1) + _revertprefetch(repo, ctx, + *[actions[name][0] for name in needdata]) + oplist = 
[actions[name][0] for name in needdata] + prefetch = scmutil.prefetchfiles + matchfiles = scmutil.matchfiles + prefetch(repo, [ctx.rev()], + matchfiles(repo, + [f for sublist in oplist for f in sublist])) _performrevert(repo, parents, ctx, actions, interactive, tobackup) if targetsubs: @@ -3782,8 +3026,11 @@ raise error.Abort("subrepository '%s' does not exist in %s!" % (sub, short(ctx.node()))) -def _revertprefetch(repo, ctx, *files): - """Let extension changing the storage layer prefetch content""" +def _revertprefetchstub(repo, ctx, *files): + """Stub method for detecting extension wrapping of _revertprefetch(), to + issue a deprecation warning.""" + +_revertprefetch = _revertprefetchstub def _performrevert(repo, parents, ctx, actions, interactive=False, tobackup=None): @@ -3797,7 +3044,6 @@ parent, p2 = parents node = ctx.node() excluded_files = [] - matcher_opts = {"exclude": excluded_files} def checkout(f): fc = ctx[f] @@ -3818,7 +3064,7 @@ if choice == 0: repo.dirstate.drop(f) else: - excluded_files.append(repo.wjoin(f)) + excluded_files.append(f) else: repo.dirstate.drop(f) for f in actions['remove'][0]: @@ -3829,7 +3075,7 @@ if choice == 0: doremove(f) else: - excluded_files.append(repo.wjoin(f)) + excluded_files.append(f) else: doremove(f) for f in actions['drop'][0]: @@ -3849,8 +3095,8 @@ newlyaddedandmodifiedfiles = set() if interactive: # Prompt the user for changes to revert - torevert = [repo.wjoin(f) for f in actions['revert'][0]] - m = scmutil.match(ctx, torevert, matcher_opts) + torevert = [f for f in actions['revert'][0] if f not in excluded_files] + m = scmutil.matchfiles(repo, torevert) diffopts = patch.difffeatureopts(repo.ui, whitespace=True) diffopts.nodates = True diffopts.git = True @@ -3895,7 +3141,7 @@ try: patch.internalpatch(repo.ui, repo, fp, 1, eolmode=None) except error.PatchError as err: - raise error.Abort(str(err)) + raise error.Abort(pycompat.bytestr(err)) del fp else: for f in actions['revert'][0]: @@ -4031,3 +3277,23 @@ if 
after[1]: hint = after[0] raise error.Abort(_('no %s in progress') % task, hint=hint) + +class changeset_printer(logcmdutil.changesetprinter): + + def __init__(self, ui, *args, **kwargs): + msg = ("'cmdutil.changeset_printer' is deprecated, " + "use 'logcmdutil.logcmdutil'") + ui.deprecwarn(msg, "4.6") + super(changeset_printer, self).__init__(ui, *args, **kwargs) + +def displaygraph(ui, *args, **kwargs): + msg = ("'cmdutil.displaygraph' is deprecated, " + "use 'logcmdutil.displaygraph'") + ui.deprecwarn(msg, "4.6") + return logcmdutil.displaygraph(ui, *args, **kwargs) + +def show_changeset(ui, *args, **kwargs): + msg = ("'cmdutil.show_changeset' is deprecated, " + "use 'logcmdutil.changesetdisplayer'") + ui.deprecwarn(msg, "4.6") + return logcmdutil.changesetdisplayer(ui, *args, **kwargs) diff -r fb92df8b634c -r ed5448edcbfa mercurial/color.py --- a/mercurial/color.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/color.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,7 +14,10 @@ from . import ( encoding, pycompat, - util +) + +from .utils import ( + stringutil, ) try: @@ -87,14 +90,16 @@ 'branches.inactive': 'none', 'diff.changed': 'white', 'diff.deleted': 'red', - 'diff.deleted.highlight': 'red bold underline', + 'diff.deleted.changed': 'red', + 'diff.deleted.unchanged': 'red dim', 'diff.diffline': 'bold', 'diff.extended': 'cyan bold', 'diff.file_a': 'red bold', 'diff.file_b': 'green bold', 'diff.hunk': 'magenta', 'diff.inserted': 'green', - 'diff.inserted.highlight': 'green bold underline', + 'diff.inserted.changed': 'green', + 'diff.inserted.unchanged': 'green dim', 'diff.tab': '', 'diff.trailingwhitespace': 'bold red_background', 'changeset.public': '', @@ -165,15 +170,15 @@ ui._terminfoparams.clear() return - for key, (b, e, c) in ui._terminfoparams.items(): + for key, (b, e, c) in ui._terminfoparams.copy().items(): if not b: continue - if not c and not curses.tigetstr(e): + if not c and not curses.tigetstr(pycompat.sysstr(e)): # Most terminals don't support dim, 
invis, etc, so don't be # noisy and use ui.debug(). ui.debug("no terminfo entry for %s\n" % e) del ui._terminfoparams[key] - if not curses.tigetstr('setaf') or not curses.tigetstr('setab'): + if not curses.tigetstr(r'setaf') or not curses.tigetstr(r'setab'): # Only warn about missing terminfo entries if we explicitly asked for # terminfo mode and we're in a formatted terminal. if mode == "terminfo" and formatted: @@ -200,7 +205,7 @@ auto = (config == 'auto') always = False - if not auto and util.parsebool(config): + if not auto and stringutil.parsebool(config): # We want the config to behave like a boolean, "on" is actually auto, # but "always" value is treated as a special case to reduce confusion. if ui.configsource('ui', 'color') == '--color' or config == 'always': @@ -322,11 +327,11 @@ if termcode: return termcode else: - return curses.tigetstr(val) + return curses.tigetstr(pycompat.sysstr(val)) elif bg: - return curses.tparm(curses.tigetstr('setab'), val) + return curses.tparm(curses.tigetstr(r'setab'), val) else: - return curses.tparm(curses.tigetstr('setaf'), val) + return curses.tparm(curses.tigetstr(r'setaf'), val) def _mergeeffects(text, start, stop): """Insert start sequence at every occurrence of stop sequence @@ -371,7 +376,7 @@ """add color control code according to the mode""" if ui._colormode == 'debug': if label and msg: - if msg[-1] == '\n': + if msg.endswith('\n'): msg = "[%s|%s]\n" % (label, msg[:-1]) else: msg = "[%s|%s]" % (label, msg) diff -r fb92df8b634c -r ed5448edcbfa mercurial/commands.py --- a/mercurial/commands.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/commands.py Wed Apr 18 15:32:08 2018 -0400 @@ -41,6 +41,7 @@ help, hg, lock as lockmod, + logcmdutil, merge as mergemod, obsolete, obsutil, @@ -53,12 +54,17 @@ rewriteutil, scmutil, server, - sshserver, streamclone, tags as tagsmod, templatekw, ui as uimod, util, + wireprotoserver, +) +from .utils import ( + dateutil, + procutil, + stringutil, ) release = lockmod.release @@ -67,7 
+73,7 @@ table.update(debugcommandsmod.command._table) command = registrar.command(table) -readonly = registrar.command.readonly +INTENT_READONLY = registrar.INTENT_READONLY # common command options @@ -241,14 +247,10 @@ Returns 0 if all files are successfully added. """ opts = pycompat.byteskwargs(opts) - try: - sim = float(opts.get('similarity') or 100) - except ValueError: - raise error.Abort(_('similarity must be a number')) - if sim < 0 or sim > 100: - raise error.Abort(_('similarity must be between 0 and 100')) + if not opts.get('similarity'): + opts['similarity'] = '100' matcher = scmutil.match(repo[None], pats, opts) - return scmutil.addremove(repo, matcher, "", opts, similarity=sim / 100.0) + return scmutil.addremove(repo, matcher, "", opts) @command('^annotate|blame', [('r', 'rev', '', _('annotate the specified revision'), _('REV')), @@ -301,9 +303,9 @@ rootfm = ui.formatter('annotate', opts) if ui.quiet: - datefunc = util.shortdate + datefunc = dateutil.shortdate else: - datefunc = util.datestr + datefunc = dateutil.datestr if ctx.rev() is None: def hexfn(node): if node is None: @@ -336,8 +338,8 @@ ('number', ' ', lambda x: x.fctx.rev(), formatrev), ('changeset', ' ', lambda x: hexfn(x.fctx.node()), formathex), ('date', ' ', lambda x: x.fctx.date(), util.cachefunc(datefunc)), - ('file', ' ', lambda x: x.fctx.path(), str), - ('line_number', ':', lambda x: x.lineno, str), + ('file', ' ', lambda x: x.fctx.path(), pycompat.bytestr), + ('line_number', ':', lambda x: x.lineno, pycompat.bytestr), ] fieldnamemap = {'number': 'rev', 'changeset': 'node'} @@ -384,9 +386,9 @@ % ((pats and m.rel(abs)) or abs)) continue - fm = rootfm.nested('lines') - lines = fctx.annotate(follow=follow, linenumber=linenumber, - skiprevs=skiprevs, diffopts=diffopts) + fm = rootfm.nested('lines', tmpl='{rev}: {line}') + lines = fctx.annotate(follow=follow, skiprevs=skiprevs, + diffopts=diffopts) if not lines: fm.end() continue @@ -394,7 +396,7 @@ pieces = [] for f, sep in funcmap: - l 
= [f(n) for n, dummy in lines] + l = [f(n) for n in lines] if fm.isplain(): sizes = [encoding.colwidth(x) for x in l] ml = max(sizes) @@ -403,16 +405,17 @@ formats.append(['%s' for x in l]) pieces.append(l) - for f, p, l in zip(zip(*formats), zip(*pieces), lines): + for f, p, n in zip(zip(*formats), zip(*pieces), lines): fm.startitem() + fm.context(fctx=n.fctx) fm.write(fields, "".join(f), *p) - if l[0].skip: + if n.skip: fmt = "* %s" else: fmt = ": %s" - fm.write('line', fmt, l[1]) - - if not lines[-1][1].endswith('\n'): + fm.write('line', fmt, n.text) + + if not lines[-1].text.endswith('\n'): fm.plain('\n') fm.end() @@ -475,7 +478,7 @@ if not ctx: raise error.Abort(_('no working directory: please specify a revision')) node = ctx.node() - dest = cmdutil.makefilename(repo, dest, node) + dest = cmdutil.makefilename(ctx, dest) if os.path.realpath(dest) == repo.root: raise error.Abort(_('repository root cannot be destination')) @@ -485,11 +488,11 @@ if dest == '-': if kind == 'files': raise error.Abort(_('cannot archive plain files to stdout')) - dest = cmdutil.makefileobj(repo, dest) + dest = cmdutil.makefileobj(ctx, dest) if not prefix: prefix = os.path.basename(repo.root) + '-%h' - prefix = cmdutil.makefilename(repo, prefix, node) + prefix = cmdutil.makefilename(ctx, prefix) match = scmutil.match(ctx, [], opts) archival.archive(repo, dest, node, kind, not opts.get('no_decode'), match, prefix, subrepos=opts.get('subrepos')) @@ -583,7 +586,7 @@ date = opts.get('date') if date: - opts['date'] = util.parsedate(date) + opts['date'] = dateutil.parsedate(date) cmdutil.checkunfinished(repo) cmdutil.bailifchanged(repo) @@ -622,7 +625,7 @@ repo.setparents(op1, op2) dsguard.close() hg._showstats(repo, stats) - if stats[3]: + if stats.unresolvedcount: repo.ui.status(_("use 'hg resolve' to retry unresolved " "file merges\n")) return 1 @@ -802,7 +805,7 @@ # update state if good or bad or skip: if rev: - nodes = [repo.lookup(i) for i in scmutil.revrange(repo, [rev])] + nodes = 
[repo[i].node() for i in scmutil.revrange(repo, [rev])] else: nodes = [repo.lookup('.')] if good: @@ -823,7 +826,7 @@ cmdutil.bailifchanged(repo) return hg.clean(repo, node, show_stats=show_stats) - displayer = cmdutil.show_changeset(ui, repo, {}) + displayer = logcmdutil.changesetdisplayer(ui, repo, {}) if command: changesets = 1 @@ -859,7 +862,8 @@ transition = "bad" state[transition].append(node) ctx = repo[node] - ui.status(_('changeset %d:%s: %s\n') % (ctx, ctx, transition)) + ui.status(_('changeset %d:%s: %s\n') % (ctx.rev(), ctx, + transition)) hbisect.checkstate(state) # bisect nodes, changesets, bgood = hbisect.bisect(repo, state) @@ -1079,7 +1083,8 @@ _('show only branches that have unmerged heads (DEPRECATED)')), ('c', 'closed', False, _('show normal and closed branches')), ] + formatteropts, - _('[-c]'), cmdtype=readonly) + _('[-c]'), + intents={INTENT_READONLY}) def branches(ui, repo, active=False, closed=False, **opts): """list repository named branches @@ -1129,7 +1134,7 @@ fm.startitem() fm.write('branch', '%s', tag, label=label) rev = ctx.rev() - padsize = max(31 - len(str(rev)) - encoding.colwidth(tag), 0) + padsize = max(31 - len("%d" % rev) - encoding.colwidth(tag), 0) fmt = ' ' * padsize + ' %d:%s' fm.condwrite(not ui.quiet, 'rev node', fmt, rev, hexfunc(ctx.node()), label='log.changeset changeset.%s' % ctx.phasestr()) @@ -1156,13 +1161,15 @@ def bundle(ui, repo, fname, dest=None, **opts): """create a bundle file - Generate a bundle file containing data to be added to a repository. + Generate a bundle file containing data to be transferred to another + repository. To create a bundle containing all changesets, use -a/--all (or --base null). Otherwise, hg assumes the destination will have all the nodes you specify with --base parameters. Otherwise, hg will assume the repository has all the nodes in destination, or - default-push/default if no destination is specified. 
+ default-push/default if no destination is specified, where destination + is the repository you provide through DEST option. You can change bundle format with the -t/--type option. See :hg:`help bundlespec` for documentation on this format. By default, @@ -1189,12 +1196,12 @@ bundletype = opts.get('type', 'bzip2').lower() try: - bcompression, cgversion, params = exchange.parsebundlespec( - repo, bundletype, strict=False) + bundlespec = exchange.parsebundlespec(repo, bundletype, strict=False) except error.UnsupportedBundleSpecification as e: - raise error.Abort(str(e), + raise error.Abort(pycompat.bytestr(e), hint=_("see 'hg help bundlespec' for supported " "values for --type")) + cgversion = bundlespec.contentopts["cg.version"] # Packed bundles are a pseudo bundle format for now. if cgversion == 's1': @@ -1218,13 +1225,14 @@ if dest: raise error.Abort(_("--base is incompatible with specifying " "a destination")) - common = [repo.lookup(rev) for rev in base] - heads = revs and map(repo.lookup, revs) or None + common = [repo[rev].node() for rev in base] + heads = [repo[r].node() for r in revs] if revs else None outgoing = discovery.outgoing(repo, common, heads) else: dest = ui.expandpath(dest or 'default-push', dest or 'default') dest, branches = hg.parseurl(dest, opts.get('branch')) other = hg.peer(repo, opts, dest) + revs = [repo[r].hex() for r in revs] revs, checkout = hg.addbranchrevs(repo, repo, branches, revs) heads = revs and map(repo.lookup, revs) or revs outgoing = discovery.findcommonoutgoing(repo, other, @@ -1237,12 +1245,11 @@ return 1 if cgversion == '01': #bundle1 - if bcompression is None: - bcompression = 'UN' - bversion = 'HG10' + bcompression + bversion = 'HG10' + bundlespec.wirecompression bcompression = None elif cgversion in ('02', '03'): bversion = 'HG20' + bcompression = bundlespec.wirecompression else: raise error.ProgrammingError( 'bundle: unexpected changegroup version %s' % cgversion) @@ -1252,18 +1259,22 @@ # level without a) formalizing 
the bundlespec changes to declare it # b) introducing a command flag. compopts = {} - complevel = ui.configint('experimental', 'bundlecomplevel') + complevel = ui.configint('experimental', + 'bundlecomplevel.' + bundlespec.compression) + if complevel is None: + complevel = ui.configint('experimental', 'bundlecomplevel') if complevel is not None: compopts['level'] = complevel - - contentopts = {'cg.version': cgversion} + # Allow overriding the bundling of obsmarker in phases through + # configuration while we don't have a bundle version that include them if repo.ui.configbool('experimental', 'evolution.bundle-obsmarker'): - contentopts['obsolescence'] = True + bundlespec.contentopts['obsolescence'] = True if repo.ui.configbool('experimental', 'bundle-phases'): - contentopts['phases'] = True + bundlespec.contentopts['phases'] = True + bundle2.writenewbundle(ui, repo, 'bundle', fname, bversion, outgoing, - contentopts, compression=bcompression, + bundlespec.contentopts, compression=bcompression, compopts=compopts) @command('cat', @@ -1273,7 +1284,8 @@ ('', 'decode', None, _('apply any matching decode filter')), ] + walkopts + formatteropts, _('[OPTION]... FILE...'), - inferrepo=True, cmdtype=readonly) + inferrepo=True, + intents={INTENT_READONLY}) def cat(ui, repo, file1, *pats, **opts): """output the current or given revision of files @@ -1281,7 +1293,9 @@ no revision is given, the parent of the working directory is used. Output may be to a file, in which case the name of the file is - given using a format string. The formatting rules as follows: + given using a template string. See :hg:`help templates`. 
In addition + to the common template keywords, the following formatting rules are + supported: :``%%``: literal "%" character :``%s``: basename of file being printed @@ -1292,6 +1306,7 @@ :``%h``: short-form changeset hash (12 hexadecimal digits) :``%r``: zero-padded changeset revision number :``%b``: basename of the exporting repository + :``\\``: literal "\\" character Returns 0 on success. """ @@ -1306,7 +1321,7 @@ fntemplate = '' if fntemplate: - fm = formatter.nullformatter(ui, 'cat') + fm = formatter.nullformatter(ui, 'cat', opts) else: ui.pager('cat') fm = ui.formatter('cat', opts) @@ -1319,8 +1334,10 @@ 'directory (only a repository)')), ('u', 'updaterev', '', _('revision, tag, or branch to check out'), _('REV')), - ('r', 'rev', [], _('include the specified changeset'), _('REV')), - ('b', 'branch', [], _('clone only the specified branch'), _('BRANCH')), + ('r', 'rev', [], _('do not clone everything, but include this changeset' + ' and its ancestors'), _('REV')), + ('b', 'branch', [], _('do not clone everything, but include this branch\'s' + ' changesets and their ancestors'), _('BRANCH')), ('', 'pull', None, _('use pull protocol to copy metadata')), ('', 'uncompressed', None, _('an alias to --stream (DEPRECATED)')), @@ -1399,12 +1416,13 @@ i) tip When cloning from servers that support it, Mercurial may fetch - pre-generated data from a server-advertised URL. When this is done, - hooks operating on incoming changesets and changegroups may fire twice, - once for the bundle fetched from the URL and another for any additional - data not fetched from this URL. In addition, if an error occurs, the - repository may be rolled back to a partial clone. This behavior may - change in future releases. See :hg:`help -e clonebundles` for more. + pre-generated data from a server-advertised URL or inline from the + same stream. 
When this is done, hooks operating on incoming changesets + and changegroups may fire more than once, once for each pre-generated + bundle and as well as for any additional remaining data. In addition, + if an error occurs, the repository may be rolled back to a partial + clone. This behavior may change in future releases. + See :hg:`help -e clonebundles` for more. Examples: @@ -1443,7 +1461,7 @@ r = hg.clone(ui, opts, source, dest, pull=opts.get('pull'), stream=opts.get('stream') or opts.get('uncompressed'), - rev=opts.get('rev'), + revs=opts.get('rev'), update=opts.get('updaterev') or not opts.get('noupdate'), branch=opts.get('branch'), shareopts=opts.get('shareopts')) @@ -1550,7 +1568,7 @@ extra = {} if opts.get('close_branch'): - extra['close'] = 1 + extra['close'] = '1' if not bheads: raise error.Abort(_('can only close branch heads')) @@ -1618,7 +1636,8 @@ ('l', 'local', None, _('edit repository config')), ('g', 'global', None, _('edit global config'))] + formatteropts, _('[-u] [NAME]...'), - optionalrepo=True, cmdtype=readonly) + optionalrepo=True, + intents={INTENT_READONLY}) def config(ui, repo, *values, **opts): """show combined config settings from all hgrc files @@ -1628,7 +1647,7 @@ of that config item. With multiple arguments, print names and values of all config - items with matching section names. + items with matching section names or section.names. With --edit, start an editor on the user-level config file. With --global, edit the system-wide config file. With --local, edit the @@ -1689,11 +1708,15 @@ else: raise error.ProgrammingError('unknown rctype: %s' % t) untrusted = bool(opts.get('untrusted')) + + selsections = selentries = [] if values: - sections = [v for v in values if '.' not in v] - items = [v for v in values if '.' in v] - if len(items) > 1 or items and sections: - raise error.Abort(_('only one config item permitted')) + selsections = [v for v in values if '.' not in v] + selentries = [v for v in values if '.' 
in v] + uniquesel = (len(selentries) == 1 and not selsections) + selsections = set(selsections) + selentries = set(selentries) + matched = False for section, name, value in ui.walkconfig(untrusted=untrusted): source = ui.configsource(section, name, untrusted) @@ -1702,24 +1725,16 @@ source = source or 'none' value = value.replace('\n', '\\n') entryname = section + '.' + name - if values: - for v in values: - if v == section: - fm.startitem() - fm.condwrite(ui.debugflag, 'source', '%s: ', source) - fm.write('name value', '%s=%s\n', entryname, value) - matched = True - elif v == entryname: - fm.startitem() - fm.condwrite(ui.debugflag, 'source', '%s: ', source) - fm.write('value', '%s\n', value) - fm.data(name=entryname) - matched = True + if values and not (section in selsections or entryname in selentries): + continue + fm.startitem() + fm.condwrite(ui.debugflag, 'source', '%s: ', source) + if uniquesel: + fm.data(name=entryname) + fm.write('value', '%s\n', value) else: - fm.startitem() - fm.condwrite(ui.debugflag, 'source', '%s: ', source) fm.write('name value', '%s=%s\n', entryname, value) - matched = True + matched = True fm.end() if matched: return 0 @@ -1791,7 +1806,8 @@ ('c', 'change', '', _('change made by revision'), _('REV')) ] + diffopts + diffopts2 + walkopts + subrepoopts, _('[OPTION]... 
([-c REV] | [-r REV1 [-r REV2]]) [FILE]...'), - inferrepo=True, cmdtype=readonly) + inferrepo=True, + intents={INTENT_READONLY}) def diff(ui, repo, *pats, **opts): """diff repository (or selected files) @@ -1861,29 +1877,31 @@ raise error.Abort(msg) elif change: repo = scmutil.unhidehashlikerevs(repo, [change], 'nowarn') - node2 = scmutil.revsingle(repo, change, None).node() - node1 = repo[node2].p1().node() + ctx2 = scmutil.revsingle(repo, change, None) + ctx1 = ctx2.p1() else: repo = scmutil.unhidehashlikerevs(repo, revs, 'nowarn') - node1, node2 = scmutil.revpair(repo, revs) + ctx1, ctx2 = scmutil.revpair(repo, revs) + node1, node2 = ctx1.node(), ctx2.node() if reverse: node1, node2 = node2, node1 diffopts = patch.diffallopts(ui, opts) - m = scmutil.match(repo[node2], pats, opts) + m = scmutil.match(ctx2, pats, opts) ui.pager('diff') - cmdutil.diffordiffstat(ui, repo, diffopts, node1, node2, m, stat=stat, - listsubrepos=opts.get('subrepos'), - root=opts.get('root')) + logcmdutil.diffordiffstat(ui, repo, diffopts, node1, node2, m, stat=stat, + listsubrepos=opts.get('subrepos'), + root=opts.get('root')) @command('^export', [('o', 'output', '', _('print output to file with formatted name'), _('FORMAT')), ('', 'switch-parent', None, _('diff against the second parent')), ('r', 'rev', [], _('revisions to export'), _('REV')), - ] + diffopts, - _('[OPTION]... [-o OUTFILESPEC] [-r] [REV]...'), cmdtype=readonly) + ] + diffopts + formatteropts, + _('[OPTION]... [-o OUTFILESPEC] [-r] [REV]...'), + intents={INTENT_READONLY}) def export(ui, repo, *changesets, **opts): """dump the header and diffs for one or more changesets @@ -1901,7 +1919,9 @@ first parent only. Output may be to a file, in which case the name of the file is - given using a format string. The formatting rules are as follows: + given using a template string. See :hg:`help templates`. 
In addition + to the common template keywords, the following formatting rules are + supported: :``%%``: literal "%" character :``%H``: changeset hash (40 hexadecimal digits) @@ -1912,6 +1932,7 @@ :``%m``: first line of the commit message (only alphanumeric characters) :``%n``: zero-padded sequence number, starting at 1 :``%r``: zero-padded changeset revision number + :``\\``: literal "\\" character Without the -a/--text option, export will avoid generating diffs of files it detects as binary. With -a, export will generate a @@ -1956,16 +1977,27 @@ ui.note(_('exporting patches:\n')) else: ui.note(_('exporting patch:\n')) - ui.pager('export') - cmdutil.export(repo, revs, fntemplate=opts.get('output'), - switch_parent=opts.get('switch_parent'), - opts=patch.diffallopts(ui, opts)) + + fntemplate = opts.get('output') + if cmdutil.isstdiofilename(fntemplate): + fntemplate = '' + + if fntemplate: + fm = formatter.nullformatter(ui, 'export', opts) + else: + ui.pager('export') + fm = ui.formatter('export', opts) + with fm: + cmdutil.export(repo, revs, fm, fntemplate=fntemplate, + switch_parent=opts.get('switch_parent'), + opts=patch.diffallopts(ui, opts)) @command('files', [('r', 'rev', '', _('search the repository as it is in REV'), _('REV')), ('0', 'print0', None, _('end filenames with NUL, for use with xargs')), ] + walkopts + formatteropts + subrepoopts, - _('[OPTION]... [FILE]...'), cmdtype=readonly) + _('[OPTION]... [FILE]...'), + intents={INTENT_READONLY}) def files(ui, repo, *pats, **opts): """list tracked files @@ -2027,7 +2059,11 @@ with ui.formatter('files', opts) as fm: return cmdutil.files(ui, ctx, m, fm, fmt, opts.get('subrepos')) -@command('^forget', walkopts, _('[OPTION]... FILE...'), inferrepo=True) +@command( + '^forget', + [('i', 'interactive', None, _('use interactive mode')), + ] + walkopts + dryrunopts, + _('[OPTION]... 
FILE...'), inferrepo=True) def forget(ui, repo, *pats, **opts): """forget the specified files on the next commit @@ -2062,7 +2098,10 @@ raise error.Abort(_('no files specified')) m = scmutil.match(repo[None], pats, opts) - rejected = cmdutil.forget(ui, repo, m, prefix="", explicitonly=False)[0] + dryrun, interactive = opts.get('dry_run'), opts.get('interactive') + rejected = cmdutil.forget(ui, repo, m, prefix="", + explicitonly=False, dryrun=dryrun, + interactive=interactive)[0] return rejected and 1 or 0 @command( @@ -2153,7 +2192,7 @@ if not opts.get('user') and opts.get('currentuser'): opts['user'] = ui.username() if not opts.get('date') and opts.get('currentdate'): - opts['date'] = "%d %d" % util.makedate() + opts['date'] = "%d %d" % dateutil.makedate() editor = cmdutil.getcommiteditor(editform='graft', **pycompat.strkwargs(opts)) @@ -2172,10 +2211,10 @@ raise cmdutil.wrongtooltocontinue(repo, _('graft')) else: + if not revs: + raise error.Abort(_('no revisions specified')) cmdutil.checkunfinished(repo) cmdutil.bailifchanged(repo) - if not revs: - raise error.Abort(_('no revisions specified')) revs = scmutil.revrange(repo, revs) skipped = set() @@ -2292,15 +2331,15 @@ finally: repo.ui.setconfig('ui', 'forcemerge', '', 'graft') # report any conflicts - if stats and stats[3] > 0: + if stats.unresolvedcount > 0: # write out state for --continue nodelines = [repo[rev].hex() + "\n" for rev in revs[pos:]] repo.vfs.write('graftstate', ''.join(nodelines)) extra = '' if opts.get('user'): - extra += ' --user %s' % util.shellquote(opts['user']) + extra += ' --user %s' % procutil.shellquote(opts['user']) if opts.get('date'): - extra += ' --date %s' % util.shellquote(opts['date']) + extra += ' --date %s' % procutil.shellquote(opts['date']) if opts.get('log'): extra += ' --log' hint=_("use 'hg resolve' and 'hg graft --continue%s'") % extra @@ -2341,7 +2380,8 @@ ('d', 'date', None, _('list the date (short with -q)')), ] + formatteropts + walkopts, _('[OPTION]... 
PATTERN [FILE]...'), - inferrepo=True, cmdtype=readonly) + inferrepo=True, + intents={INTENT_READONLY}) def grep(ui, repo, pattern, *pats, **opts): """search revision history for a pattern in specified files @@ -2370,7 +2410,7 @@ try: regexp = util.re.compile(pattern, reflags) except re.error as inst: - ui.warn(_("grep: invalid match pattern: %s\n") % inst) + ui.warn(_("grep: invalid match pattern: %s\n") % pycompat.bytestr(inst)) return 1 sep, eol = ':', '\n' if opts.get('print0'): @@ -2454,7 +2494,7 @@ @util.cachefunc def binary(): flog = getfile(fn) - return util.binary(flog.read(ctx.filenode(fn))) + return stringutil.binary(flog.read(ctx.filenode(fn))) fieldnamemap = {'filename': 'file', 'linenumber': 'line_number'} if opts.get('all'): @@ -2486,7 +2526,7 @@ if not opts.get('text') and binary(): fm.plain(_(" Binary file matches")) else: - displaymatches(fm.nested('texts'), l) + displaymatches(fm.nested('texts', tmpl='{text}'), l) fm.plain(eol) found = True if opts.get('files_with_matches'): @@ -2571,8 +2611,11 @@ skip[fn] = True if copy: skip[copy] = True - del matches[rev] del revfiles[rev] + # We will keep the matches dict for the duration of the window + # clear the matches dict once the window is over + if not revfiles: + matches.clear() fm.end() return not found @@ -2584,7 +2627,8 @@ ('a', 'active', False, _('show active branchheads only (DEPRECATED)')), ('c', 'closed', False, _('show normal and closed branch heads')), ] + templateopts, - _('[-ct] [-r STARTREV] [REV]...'), cmdtype=readonly) + _('[-ct] [-r STARTREV] [REV]...'), + intents={INTENT_READONLY}) def heads(ui, repo, *branchrevs, **opts): """show branch heads @@ -2626,7 +2670,8 @@ heads = [repo[h] for h in heads] if branchrevs: - branches = set(repo[br].branch() for br in branchrevs) + branches = set(repo[r].branch() + for r in scmutil.revrange(repo, branchrevs)) heads = [h for h in heads if h.branch() in branches] if opts.get('active') and branchrevs: @@ -2647,7 +2692,7 @@ ui.pager('heads') heads = 
sorted(heads, key=lambda x: -x.rev()) - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) for ctx in heads: displayer.show(ctx) displayer.close() @@ -2659,7 +2704,8 @@ ('s', 'system', [], _('show help for specific platform(s)')), ], _('[-ecks] [TOPIC]'), - norepo=True, cmdtype=readonly) + norepo=True, + intents={INTENT_READONLY}) def help_(ui, name=None, **opts): """show help for a given topic or a help overview @@ -2701,7 +2747,8 @@ ('B', 'bookmarks', None, _('show bookmarks')), ] + remoteopts + formatteropts, _('[-nibtB] [-r REV] [SOURCE]'), - optionalrepo=True, cmdtype=readonly) + optionalrepo=True, + intents={INTENT_READONLY}) def identify(ui, repo, source=None, rev=None, num=None, id=None, branch=None, tags=None, bookmarks=None, **opts): """identify the working directory or specified revision @@ -2824,7 +2871,7 @@ numoutput = ["%d" % p.rev() for p in parents] output.append("%s%s" % ('+'.join(numoutput), dirty)) - fn = fm.nested('parents') + fn = fm.nested('parents', tmpl='{rev}:{node|formatnode}', sep=' ') for p in parents: fn.startitem() fn.data(rev=p.rev()) @@ -3003,7 +3050,7 @@ date = opts.get('date') if date: - opts['date'] = util.parsedate(date) + opts['date'] = dateutil.parsedate(date) exact = opts.get('exact') update = not opts.get('bypass') @@ -3055,9 +3102,10 @@ haspatch = False for hunk in patch.split(patchfile): - (msg, node, rej) = cmdutil.tryimportone(ui, repo, hunk, - parents, opts, - msgs, hg.clean) + with patch.extract(ui, hunk) as patchdata: + msg, node, rej = cmdutil.tryimportone(ui, repo, patchdata, + parents, opts, + msgs, hg.clean) if msg: haspatch = True ui.note(msg + '\n') @@ -3155,11 +3203,11 @@ """ opts = pycompat.byteskwargs(opts) if opts.get('graph'): - cmdutil.checkunsupportedgraphflags([], opts) + logcmdutil.checkunsupportedgraphflags([], opts) def display(other, chlist, displayer): - revdag = cmdutil.graphrevs(other, chlist, opts) - cmdutil.displaygraph(ui, repo, revdag, 
displayer, - graphmod.asciiedges) + revdag = logcmdutil.graphrevs(other, chlist, opts) + logcmdutil.displaygraph(ui, repo, revdag, displayer, + graphmod.asciiedges) hg._incoming(display, lambda: 1, ui, repo, source, opts, buffered=True) return 0 @@ -3236,10 +3284,9 @@ end = '\0' else: end = '\n' - rev = scmutil.revsingle(repo, opts.get('rev'), None).node() + ctx = scmutil.revsingle(repo, opts.get('rev'), None) ret = 1 - ctx = repo[rev] m = scmutil.match(ctx, pats, opts, default='relglob', badfn=lambda x, y: False) @@ -3278,7 +3325,8 @@ _('do not display revision or any of its ancestors'), _('REV')), ] + logopts + walkopts, _('[OPTION]... [FILE]'), - inferrepo=True, cmdtype=readonly) + inferrepo=True, + intents={INTENT_READONLY}) def log(ui, repo, *pats, **opts): """show revision history of entire repository or files @@ -3414,71 +3462,40 @@ raise error.Abort(_('--line-range requires --follow')) if linerange and pats: + # TODO: take pats as patterns with no line-range filter raise error.Abort( _('FILE arguments are not compatible with --line-range option') ) repo = scmutil.unhidehashlikerevs(repo, opts.get('rev'), 'nowarn') - revs, filematcher = cmdutil.getlogrevs(repo, pats, opts) - hunksfilter = None - - if opts.get('graph'): - if linerange: - raise error.Abort(_('graph not supported with line range patterns')) - return cmdutil.graphlog(ui, repo, revs, filematcher, opts) - + revs, differ = logcmdutil.getrevs(repo, pats, opts) if linerange: - revs, lrfilematcher, hunksfilter = cmdutil.getloglinerangerevs( - repo, revs, opts) - - if filematcher is not None and lrfilematcher is not None: - basefilematcher = filematcher - - def filematcher(rev): - files = (basefilematcher(rev).files() - + lrfilematcher(rev).files()) - return scmutil.matchfiles(repo, files) - - elif filematcher is None: - filematcher = lrfilematcher + # TODO: should follow file history from logcmdutil._initialrevs(), + # then filter the result by logcmdutil._makerevset() and --limit + revs, differ = 
logcmdutil.getlinerangerevs(repo, revs, opts) getrenamed = None if opts.get('copies'): endrev = None - if opts.get('rev'): - endrev = scmutil.revrange(repo, opts.get('rev')).max() + 1 + if revs: + endrev = revs.max() + 1 getrenamed = templatekw.getrenamedfn(repo, endrev=endrev) ui.pager('log') - displayer = cmdutil.show_changeset(ui, repo, opts, buffered=True) - for rev in revs: - ctx = repo[rev] - copies = None - if getrenamed is not None and rev: - copies = [] - for fn in ctx.files(): - rename = getrenamed(fn, rev) - if rename: - copies.append((fn, rename[0])) - if filematcher: - revmatchfn = filematcher(ctx.rev()) - else: - revmatchfn = None - if hunksfilter: - revhunksfilter = hunksfilter(rev) - else: - revhunksfilter = None - displayer.show(ctx, copies=copies, matchfn=revmatchfn, - hunksfilterfn=revhunksfilter) - displayer.flush(ctx) - - displayer.close() + displayer = logcmdutil.changesetdisplayer(ui, repo, opts, differ, + buffered=True) + if opts.get('graph'): + displayfn = logcmdutil.displaygraphrevs + else: + displayfn = logcmdutil.displayrevs + displayfn(ui, repo, revs, displayer, getrenamed) @command('manifest', [('r', 'rev', '', _('revision to display'), _('REV')), ('', 'all', False, _("list files from all revisions"))] + formatteropts, - _('[-r REV]'), cmdtype=readonly) + _('[-r REV]'), + intents={INTENT_READONLY}) def manifest(ui, repo, node=None, rev=None, **opts): """output the current or given revision of the project manifest @@ -3501,17 +3518,13 @@ if rev or node: raise error.Abort(_("can't specify a revision with --all")) - res = [] - prefix = "data/" - suffix = ".i" - plen = len(prefix) - slen = len(suffix) - with repo.lock(): - for fn, b, size in repo.store.datafiles(): - if size != 0 and fn[-slen:] == suffix and fn[:plen] == prefix: - res.append(fn[plen:-slen]) + res = set() + for rev in repo: + ctx = repo[rev] + res |= set(ctx.files()) + ui.pager('manifest') - for f in res: + for f in sorted(res): fm.startitem() fm.write("path", '%s\n', f) 
fm.end() @@ -3523,8 +3536,8 @@ if not node: node = rev - char = {'l': '@', 'x': '*', '': ''} - mode = {'l': '644', 'x': '755', '': '644'} + char = {'l': '@', 'x': '*', '': '', 't': 'd'} + mode = {'l': '644', 'x': '755', '': '644', 't': '755'} if node: repo = scmutil.unhidehashlikerevs(repo, [node], 'nowarn') ctx = scmutil.revsingle(repo, node) @@ -3601,10 +3614,10 @@ if opts.get('preview'): # find nodes that are ancestors of p2 but not of p1 p1 = repo.lookup('.') - p2 = repo.lookup(node) + p2 = node nodes = repo.changelog.findmissing(common=[p1], heads=[p2]) - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) for node in nodes: displayer.show(repo[node]) displayer.close() @@ -3668,16 +3681,17 @@ """ opts = pycompat.byteskwargs(opts) if opts.get('graph'): - cmdutil.checkunsupportedgraphflags([], opts) + logcmdutil.checkunsupportedgraphflags([], opts) o, other = hg._outgoing(ui, repo, dest, opts) if not o: cmdutil.outgoinghooks(ui, repo, other, opts, o) return - revdag = cmdutil.graphrevs(repo, o, opts) + revdag = logcmdutil.graphrevs(repo, o, opts) ui.pager('outgoing') - displayer = cmdutil.show_changeset(ui, repo, opts, buffered=True) - cmdutil.displaygraph(ui, repo, revdag, displayer, graphmod.asciiedges) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts, buffered=True) + logcmdutil.displaygraph(ui, repo, revdag, displayer, + graphmod.asciiedges) cmdutil.outgoinghooks(ui, repo, other, opts, o) return 0 @@ -3752,14 +3766,14 @@ else: p = [cp.node() for cp in ctx.parents()] - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) for n in p: if n != nullid: displayer.show(repo[n]) displayer.close() @command('paths', formatteropts, _('[NAME]'), optionalrepo=True, - cmdtype=readonly) + intents={INTENT_READONLY}) def paths(ui, repo, search=None, **opts): """show aliases for remote repositories @@ -3804,7 +3818,7 @@ if fm.isplain(): hidepassword = 
util.hidepassword else: - hidepassword = str + hidepassword = bytes if ui.quiet: namefmt = '%s\n' else: @@ -3930,7 +3944,7 @@ try: return hg.updatetotally(ui, repo, checkout, brev) except error.UpdateAbort as inst: - msg = _("not updating: %s") % str(inst) + msg = _("not updating: %s") % stringutil.forcebytestr(inst) hint = inst.hint raise error.UpdateAbort(msg, hint=hint) if modheads > 1: @@ -3965,6 +3979,12 @@ -R is specified). By default, this does not update the copy of the project in the working directory. + When cloning from servers that support it, Mercurial may fetch + pre-generated data. When this is done, hooks operating on incoming + changesets and changegroups may fire more than once, once for each + pre-generated bundle and as well as for any additional remaining + data. See :hg:`help -e clonebundles` for more. + Use :hg:`incoming` if you want to see what would have been added by a pull at the time you issued this command. If you then decide to add those changes to the repository, you should use :hg:`pull @@ -4018,7 +4038,9 @@ oldrevs = revs revs = [] # actually, nodes for r in oldrevs: - node = other.lookup(r) + with other.commandexecutor() as e: + node = e.callcommand('lookup', {'key': r}).result() + revs.append(node) if r == checkout: checkout = node @@ -4043,7 +4065,7 @@ brev = None if checkout: - checkout = str(repo.changelog.rev(checkout)) + checkout = repo.changelog.rev(checkout) # order below depends on implementation of # hg.addbranchrevs(). 
opts['bookmark'] is ignored, @@ -4159,7 +4181,7 @@ other = hg.peer(repo, opts, dest) if revs: - revs = [repo.lookup(r) for r in scmutil.revrange(repo, revs)] + revs = [repo[r].node() for r in scmutil.revrange(repo, revs)] if not revs: raise error.Abort(_("specified revisions evaluate to an empty set"), hint=_("use different revision arguments")) @@ -4176,7 +4198,7 @@ repo._subtoppath = dest try: # push subrepos depth-first for coherent ordering - c = repo[''] + c = repo['.'] subs = c.substate # only repos that are committed for s in sorted(subs): result = c.sub(s).push(opts) @@ -4223,7 +4245,7 @@ [('A', 'after', None, _('record delete for missing files')), ('f', 'force', None, _('forget added files, delete modified files')), - ] + subrepoopts + walkopts, + ] + subrepoopts + walkopts + dryrunopts, _('[OPTION]... FILE...'), inferrepo=True) def remove(ui, repo, *pats, **opts): @@ -4267,12 +4289,14 @@ opts = pycompat.byteskwargs(opts) after, force = opts.get('after'), opts.get('force') + dryrun = opts.get('dry_run') if not pats and not after: raise error.Abort(_('no files specified')) m = scmutil.match(repo[None], pats, opts) subrepos = opts.get('subrepos') - return cmdutil.remove(ui, repo, m, "", after, force, subrepos) + return cmdutil.remove(ui, repo, m, "", after, force, subrepos, + dryrun=dryrun) @command('rename|move|mv', [('A', 'after', None, _('record a rename that has already occurred')), @@ -4373,11 +4397,12 @@ # as 'P'. Resolved path conflicts show as 'R', the same as normal # resolved conflicts. 
mergestateinfo = { - 'u': ('resolve.unresolved', 'U'), - 'r': ('resolve.resolved', 'R'), - 'pu': ('resolve.unresolved', 'P'), - 'pr': ('resolve.resolved', 'R'), - 'd': ('resolve.driverresolved', 'D'), + mergemod.MERGE_RECORD_UNRESOLVED: ('resolve.unresolved', 'U'), + mergemod.MERGE_RECORD_RESOLVED: ('resolve.resolved', 'R'), + mergemod.MERGE_RECORD_UNRESOLVED_PATH: ('resolve.unresolved', 'P'), + mergemod.MERGE_RECORD_RESOLVED_PATH: ('resolve.resolved', 'R'), + mergemod.MERGE_RECORD_DRIVER_RESOLVED: ('resolve.driverresolved', + 'D'), } for f in ms: @@ -4400,7 +4425,8 @@ wctx = repo[None] - if ms.mergedriver and ms.mdstate() == 'u': + if (ms.mergedriver + and ms.mdstate() == mergemod.MERGE_DRIVER_STATE_UNMARKED): proceed = mergemod.driverpreprocess(repo, ms, wctx) ms.commit() # allow mark and unmark to go through @@ -4421,7 +4447,7 @@ # don't let driver-resolved files be marked, and run the conclude # step if asked to resolve - if ms[f] == "d": + if ms[f] == mergemod.MERGE_RECORD_DRIVER_RESOLVED: exact = m.exact(f) if mark: if exact: @@ -4436,20 +4462,21 @@ continue # path conflicts must be resolved manually - if ms[f] in ("pu", "pr"): + if ms[f] in (mergemod.MERGE_RECORD_UNRESOLVED_PATH, + mergemod.MERGE_RECORD_RESOLVED_PATH): if mark: - ms.mark(f, "pr") + ms.mark(f, mergemod.MERGE_RECORD_RESOLVED_PATH) elif unmark: - ms.mark(f, "pu") - elif ms[f] == "pu": + ms.mark(f, mergemod.MERGE_RECORD_UNRESOLVED_PATH) + elif ms[f] == mergemod.MERGE_RECORD_UNRESOLVED_PATH: ui.warn(_('%s: path conflict must be resolved manually\n') % f) continue if mark: - ms.mark(f, "r") + ms.mark(f, mergemod.MERGE_RECORD_RESOLVED) elif unmark: - ms.mark(f, "u") + ms.mark(f, mergemod.MERGE_RECORD_UNRESOLVED) else: # backup pre-resolve (merge uses .orig for its own purposes) a = repo.wjoin(f) @@ -4513,7 +4540,7 @@ for f in ms: if not m(f): continue - flags = ''.join(['-%s ' % o[0] for o in flaglist + flags = ''.join(['-%s ' % o[0:1] for o in flaglist if opts.get(o)]) hint = _("(try: hg resolve 
%s%s)\n") % ( flags, @@ -4686,7 +4713,7 @@ return repo.rollback(dryrun=opts.get(r'dry_run'), force=opts.get(r'force')) -@command('root', [], cmdtype=readonly) +@command('root', [], intents={INTENT_READONLY}) def root(ui, repo): """print the root (top) of the current working directory @@ -4757,7 +4784,7 @@ if repo is None: raise error.RepoError(_("there is no Mercurial repository here" " (.hg not found)")) - s = sshserver.sshserver(ui, repo) + s = wireprotoserver.sshserver(ui, repo) s.serve_forever() service = server.createservice(ui, repo, opts) @@ -4780,7 +4807,8 @@ ('', 'change', '', _('list the changed files of a revision'), _('REV')), ] + walkopts + subrepoopts + formatteropts, _('[OPTION]... [FILE]...'), - inferrepo=True, cmdtype=readonly) + inferrepo=True, + intents={INTENT_READONLY}) def status(ui, repo, *pats, **opts): """show changed files in the working directory @@ -4879,11 +4907,11 @@ raise error.Abort(msg) elif change: repo = scmutil.unhidehashlikerevs(repo, [change], 'nowarn') - node2 = scmutil.revsingle(repo, change, None).node() - node1 = repo[node2].p1().node() + ctx2 = scmutil.revsingle(repo, change, None) + ctx1 = ctx2.p1() else: repo = scmutil.unhidehashlikerevs(repo, revs, 'nowarn') - node1, node2 = scmutil.revpair(repo, revs) + ctx1, ctx2 = scmutil.revpair(repo, revs) if pats or ui.configbool('commands', 'status.relative'): cwd = repo.getcwd() @@ -4906,16 +4934,16 @@ else: show = states[:5] - m = scmutil.match(repo[node2], pats, opts) + m = scmutil.match(ctx2, pats, opts) if terse: # we need to compute clean and unknown to terse - stat = repo.status(node1, node2, m, + stat = repo.status(ctx1.node(), ctx2.node(), m, 'ignored' in show or 'i' in terse, True, True, opts.get('subrepos')) stat = cmdutil.tersedir(stat, terse) else: - stat = repo.status(node1, node2, m, + stat = repo.status(ctx1.node(), ctx2.node(), m, 'ignored' in show, 'clean' in show, 'unknown' in show, opts.get('subrepos')) @@ -4923,7 +4951,7 @@ if (opts.get('all') or 
opts.get('copies') or ui.configbool('ui', 'statuscopies')) and not opts.get('no_status'): - copy = copies.pathcopies(repo[node1], repo[node2], m) + copy = copies.pathcopies(ctx1, ctx2, m) ui.pager('status') fm = ui.formatter('status', opts) @@ -4948,7 +4976,8 @@ @command('^summary|sum', [('', 'remote', None, _('check for push and pull'))], - '[--remote]', cmdtype=readonly) + '[--remote]', + intents={INTENT_READONLY}) def summary(ui, repo, **opts): """summarize working directory state @@ -4984,7 +5013,7 @@ # shows a working directory parent *changeset*: # i18n: column positioning for "hg summary" ui.write(_('parent: %d:%s ') % (p.rev(), p), - label=cmdutil._changesetlabels(p)) + label=logcmdutil.changesetlabels(p)) ui.write(' '.join(p.tags()), label='log.tag') if p.bookmarks(): marks.extend(p.bookmarks()) @@ -5321,16 +5350,16 @@ if not opts.get('force') and bheads and p1 not in bheads: raise error.Abort(_('working directory is not at a branch head ' '(use -f to force)')) - r = scmutil.revsingle(repo, rev_).node() + node = scmutil.revsingle(repo, rev_).node() if not message: # we don't translate commit messages message = ('Added tag %s for changeset %s' % - (', '.join(names), short(r))) + (', '.join(names), short(node))) date = opts.get('date') if date: - date = util.parsedate(date) + date = dateutil.parsedate(date) if opts.get('remove'): editform = 'tag.remove' @@ -5344,12 +5373,12 @@ scmutil.revsingle(repo, rev_).rev() == nullrev): raise error.Abort(_("cannot tag null revision")) - tagsmod.tag(repo, names, r, message, opts.get('local'), + tagsmod.tag(repo, names, node, message, opts.get('local'), opts.get('user'), date, editor=editor) finally: release(lock, wlock) -@command('tags', formatteropts, '', cmdtype=readonly) +@command('tags', formatteropts, '', intents={INTENT_READONLY}) def tags(ui, repo, **opts): """list repository tags @@ -5406,7 +5435,7 @@ Returns 0 on success. 
""" opts = pycompat.byteskwargs(opts) - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) displayer.show(repo['tip']) displayer.close() @@ -5458,8 +5487,7 @@ ('r', 'rev', '', _('revision'), _('REV')) ] + mergetoolopts, _('[-C|-c|-m] [-d DATE] [[-r] REV]')) -def update(ui, repo, node=None, rev=None, clean=False, date=None, check=False, - merge=None, tool=None): +def update(ui, repo, node=None, **opts): """update working directory (or switch revisions) Update the repository's working directory to the specified @@ -5514,6 +5542,11 @@ Returns 0 on success, 1 if there are unresolved files. """ + rev = opts.get(r'rev') + date = opts.get(r'date') + clean = opts.get(r'clean') + check = opts.get(r'check') + merge = opts.get(r'merge') if rev and node: raise error.Abort(_("please specify just one revision")) @@ -5558,7 +5591,7 @@ obsfatemsg = obsutil._getfilteredreason(repo, ctxstr, ctx) ui.warn("(%s)\n" % obsfatemsg) - repo.ui.setconfig('ui', 'forcemerge', tool, 'update') + repo.ui.setconfig('ui', 'forcemerge', opts.get(r'tool'), 'update') return hg.updatetotally(ui, repo, rev, brev, clean=clean, updatecheck=updatecheck) @@ -5582,7 +5615,8 @@ """ return hg.verify(repo) -@command('version', [] + formatteropts, norepo=True, cmdtype=readonly) +@command('version', [] + formatteropts, norepo=True, + intents={INTENT_READONLY}) def version_(ui, **opts): """output version and copyright information""" opts = pycompat.byteskwargs(opts) @@ -5612,7 +5646,7 @@ names.append(name) vers.append(extensions.moduleversion(module) or None) isinternals.append(extensions.ismoduleinternal(module)) - fn = fm.nested("extensions") + fn = fm.nested("extensions", tmpl='{name}\n') if names: namefmt = " %%-%ds " % max(len(n) for n in names) places = [_("external"), _("internal")] diff -r fb92df8b634c -r ed5448edcbfa mercurial/commandserver.py --- a/mercurial/commandserver.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/commandserver.py Wed Apr 
18 15:32:08 2018 -0400 @@ -16,14 +16,22 @@ import struct import traceback +try: + import selectors + selectors.BaseSelector +except ImportError: + from .thirdparty import selectors2 as selectors + from .i18n import _ -from .thirdparty import selectors2 from . import ( encoding, error, pycompat, util, ) +from .utils import ( + procutil, +) logfile = None @@ -280,7 +288,7 @@ hellomsg += '\n' hellomsg += 'encoding: ' + encoding.encoding hellomsg += '\n' - hellomsg += 'pid: %d' % util.getpid() + hellomsg += 'pid: %d' % procutil.getpid() if util.safehasattr(os, 'getpgid'): hellomsg += '\n' hellomsg += 'pgid: %d' % os.getpgid(0) @@ -298,29 +306,6 @@ return 0 -def _protectio(ui): - """ duplicates streams and redirect original to null if ui uses stdio """ - ui.flush() - newfiles = [] - nullfd = os.open(os.devnull, os.O_RDWR) - for f, sysf, mode in [(ui.fin, util.stdin, pycompat.sysstr('rb')), - (ui.fout, util.stdout, pycompat.sysstr('wb'))]: - if f is sysf: - newfd = os.dup(f.fileno()) - os.dup2(nullfd, f.fileno()) - f = os.fdopen(newfd, mode) - newfiles.append(f) - os.close(nullfd) - return tuple(newfiles) - -def _restoreio(ui, fin, fout): - """ restores streams from duplicated ones """ - ui.flush() - for f, uif in [(fin, ui.fin), (fout, ui.fout)]: - if f is not uif: - os.dup2(f.fileno(), uif.fileno()) - f.close() - class pipeservice(object): def __init__(self, ui, repo, opts): self.ui = ui @@ -333,13 +318,12 @@ ui = self.ui # redirect stdio to null device so that broken extensions or in-process # hooks will never cause corruption of channel protocol. 
- fin, fout = _protectio(ui) - try: - sv = server(ui, self.repo, fin, fout) - return sv.serve() - finally: - sv.cleanup() - _restoreio(ui, fin, fout) + with procutil.protectedstdio(ui.fin, ui.fout) as (fin, fout): + try: + sv = server(ui, self.repo, fin, fout) + return sv.serve() + finally: + sv.cleanup() def _initworkerprocess(): # use a different process group from the master process, in order to: @@ -449,8 +433,8 @@ def init(self): self._sock = socket.socket(socket.AF_UNIX) self._servicehandler.bindsocket(self._sock, self.address) - if util.safehasattr(util, 'unblocksignal'): - util.unblocksignal(signal.SIGCHLD) + if util.safehasattr(procutil, 'unblocksignal'): + procutil.unblocksignal(signal.SIGCHLD) o = signal.signal(signal.SIGCHLD, self._sigchldhandler) self._oldsigchldhandler = o self._socketunlinked = False @@ -476,8 +460,8 @@ def _mainloop(self): exiting = False h = self._servicehandler - selector = selectors2.DefaultSelector() - selector.register(self._sock, selectors2.EVENT_READ) + selector = selectors.DefaultSelector() + selector.register(self._sock, selectors.EVENT_READ) while True: if not exiting and h.shouldexit(): # clients can no longer connect() to the domain socket, so diff -r fb92df8b634c -r ed5448edcbfa mercurial/config.py --- a/mercurial/config.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/config.py Wed Apr 18 15:32:08 2018 -0400 @@ -154,7 +154,7 @@ if inst.errno != errno.ENOENT: raise error.ParseError(_("cannot include %s (%s)") % (inc, inst.strerror), - "%s:%s" % (src, line)) + "%s:%d" % (src, line)) continue if emptyre.match(l): continue @@ -185,7 +185,7 @@ self._unset.append((section, name)) continue - raise error.ParseError(l.rstrip(), ("%s:%s" % (src, line))) + raise error.ParseError(l.rstrip(), ("%s:%d" % (src, line))) def read(self, path, fp=None, sections=None, remap=None): if not fp: diff -r fb92df8b634c -r ed5448edcbfa mercurial/configitems.py --- a/mercurial/configitems.py Wed Apr 04 10:35:09 2018 -0400 +++ 
b/mercurial/configitems.py Wed Apr 18 15:32:08 2018 -0400 @@ -114,7 +114,7 @@ coreconfigitem = getitemregister(coreitems) coreconfigitem('alias', '.*', - default=None, + default=dynamicdefault, generic=True, ) coreconfigitem('annotate', 'nodates', @@ -440,6 +440,18 @@ coreconfigitem('experimental', 'bundlecomplevel', default=None, ) +coreconfigitem('experimental', 'bundlecomplevel.bzip2', + default=None, +) +coreconfigitem('experimental', 'bundlecomplevel.gzip', + default=None, +) +coreconfigitem('experimental', 'bundlecomplevel.none', + default=None, +) +coreconfigitem('experimental', 'bundlecomplevel.zstd', + default=None, +) coreconfigitem('experimental', 'changegroup3', default=False, ) @@ -502,6 +514,9 @@ coreconfigitem('experimental', 'maxdeltachainspan', default=-1, ) +coreconfigitem('experimental', 'mergetempdirprefix', + default=None, +) coreconfigitem('experimental', 'mmapindexthreshold', default=None, ) @@ -535,10 +550,10 @@ coreconfigitem('experimental', 'hook-track-tags', default=False, ) -coreconfigitem('experimental', 'httppostargs', +coreconfigitem('experimental', 'httppeer.advertise-v2', default=False, ) -coreconfigitem('experimental', 'manifestv2', +coreconfigitem('experimental', 'httppostargs', default=False, ) coreconfigitem('experimental', 'mergedriver', @@ -556,6 +571,9 @@ coreconfigitem('experimental', 'single-head-per-branch', default=False, ) +coreconfigitem('experimental', 'sshserver.support-v2', + default=False, +) coreconfigitem('experimental', 'spacemovesdown', default=False, ) @@ -574,6 +592,21 @@ coreconfigitem('experimental', 'update.atomic-file', default=False, ) +coreconfigitem('experimental', 'sshpeer.advertise-v2', + default=False, +) +coreconfigitem('experimental', 'web.apiserver', + default=False, +) +coreconfigitem('experimental', 'web.api.http-v2', + default=False, +) +coreconfigitem('experimental', 'web.api.debugreflect', + default=False, +) +coreconfigitem('experimental', 'xdiff', + default=False, +) 
coreconfigitem('extensions', '.*', default=None, generic=True, @@ -743,6 +776,16 @@ generic=True, priority=-1, ) +coreconfigitem('merge-tools', br'.*\.mergemarkers$', + default='basic', + generic=True, + priority=-1, +) +coreconfigitem('merge-tools', br'.*\.mergemarkertemplate$', + default=dynamicdefault, # take from ui.mergemarkertemplate + generic=True, + priority=-1, +) coreconfigitem('merge-tools', br'.*\.priority$', default=0, generic=True, @@ -889,6 +932,12 @@ coreconfigitem('server', 'disablefullbundle', default=False, ) +coreconfigitem('server', 'streamunbundle', + default=False, +) +coreconfigitem('server', 'pullbundle', + default=False, +) coreconfigitem('server', 'maxhttpheaderlen', default=1024, ) @@ -907,6 +956,9 @@ coreconfigitem('server', 'zliblevel', default=-1, ) +coreconfigitem('server', 'zstdlevel', + default=3, +) coreconfigitem('share', 'pool', default=None, ) @@ -1013,9 +1065,6 @@ coreconfigitem('ui', 'graphnodetemplate', default=None, ) -coreconfigitem('ui', 'http2debuglevel', - default=None, -) coreconfigitem('ui', 'interactive', default=None, ) @@ -1114,9 +1163,6 @@ coreconfigitem('ui', 'tweakdefaults', default=False, ) -coreconfigitem('ui', 'usehttp2', - default=False, -) coreconfigitem('ui', 'username', alias=[('ui', 'user')] ) @@ -1242,6 +1288,9 @@ coreconfigitem('web', 'refreshinterval', default=20, ) +coreconfigitem('web', 'server-header', + default=None, +) coreconfigitem('web', 'staticurl', default=None, ) diff -r fb92df8b634c -r ed5448edcbfa mercurial/context.py --- a/mercurial/context.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/context.py Wed Apr 18 15:32:08 2018 -0400 @@ -22,21 +22,17 @@ nullid, nullrev, short, + wdirfilenodeids, wdirid, - wdirnodes, wdirrev, ) -from .thirdparty import ( - attr, -) from . 
import ( + dagop, encoding, error, fileset, match as matchmod, - mdiff, obsolete as obsmod, - obsutil, patch, pathutil, phases, @@ -46,12 +42,17 @@ scmutil, sparse, subrepo, + subrepoutil, util, ) +from .utils import ( + dateutil, + stringutil, +) propertycache = util.propertycache -nonascii = re.compile(r'[^\x21-\x7f]').search +nonascii = re.compile(br'[^\x21-\x7f]').search class basectx(object): """A basectx object represents the common logic for its children: @@ -60,26 +61,15 @@ be committed, memctx: a context that represents changes in-memory and can also be committed.""" - def __new__(cls, repo, changeid='', *args, **kwargs): - if isinstance(changeid, basectx): - return changeid - - o = super(basectx, cls).__new__(cls) - - o._repo = repo - o._rev = nullrev - o._node = nullid - - return o + + def __init__(self, repo): + self._repo = repo def __bytes__(self): return short(self.node()) __str__ = encoding.strmethod(__bytes__) - def __int__(self): - return self.rev() - def __repr__(self): return r"<%s %s>" % (type(self).__name__, str(self)) @@ -148,7 +138,7 @@ removed.append(fn) elif flag1 != flag2: modified.append(fn) - elif node2 not in wdirnodes: + elif node2 not in wdirfilenodeids: # When comparing files between two commits, we save time by # not comparing the file contents when the nodeids differ. 
# Note that this means we incorrectly report a reverted change @@ -173,7 +163,7 @@ @propertycache def substate(self): - return subrepo.state(self, self._repo.ui) + return subrepoutil.state(self, self._repo.ui) def subrev(self, subpath): return self.substate[subpath][1] @@ -206,22 +196,10 @@ """True if the changeset is extinct""" return self.rev() in obsmod.getrevs(self._repo, 'extinct') - def unstable(self): - msg = ("'context.unstable' is deprecated, " - "use 'context.orphan'") - self._repo.ui.deprecwarn(msg, '4.4') - return self.orphan() - def orphan(self): """True if the changeset is not obsolete but it's ancestor are""" return self.rev() in obsmod.getrevs(self._repo, 'orphan') - def bumped(self): - msg = ("'context.bumped' is deprecated, " - "use 'context.phasedivergent'") - self._repo.ui.deprecwarn(msg, '4.4') - return self.phasedivergent() - def phasedivergent(self): """True if the changeset try to be a successor of a public changeset @@ -229,12 +207,6 @@ """ return self.rev() in obsmod.getrevs(self._repo, 'phasedivergent') - def divergent(self): - msg = ("'context.divergent' is deprecated, " - "use 'context.contentdivergent'") - self._repo.ui.deprecwarn(msg, '4.4') - return self.contentdivergent() - def contentdivergent(self): """Is a successors of a changeset with multiple possible successors set @@ -242,33 +214,10 @@ """ return self.rev() in obsmod.getrevs(self._repo, 'contentdivergent') - def troubled(self): - msg = ("'context.troubled' is deprecated, " - "use 'context.isunstable'") - self._repo.ui.deprecwarn(msg, '4.4') - return self.isunstable() - def isunstable(self): """True if the changeset is either unstable, bumped or divergent""" return self.orphan() or self.phasedivergent() or self.contentdivergent() - def troubles(self): - """Keep the old version around in order to avoid breaking extensions - about different return values. 
- """ - msg = ("'context.troubles' is deprecated, " - "use 'context.instabilities'") - self._repo.ui.deprecwarn(msg, '4.4') - - troubles = [] - if self.orphan(): - troubles.append('orphan') - if self.phasedivergent(): - troubles.append('bumped') - if self.contentdivergent(): - troubles.append('divergent') - return troubles - def instabilities(self): """return the list of instabilities affecting this changeset. @@ -428,54 +377,44 @@ return r -def _filterederror(repo, changeid): - """build an exception to be raised about a filtered changeid - - This is extracted in a function to help extensions (eg: evolve) to - experiment with various message variants.""" - if repo.filtername.startswith('visible'): - - # Check if the changeset is obsolete - unfilteredrepo = repo.unfiltered() - ctx = unfilteredrepo[changeid] - - # If the changeset is obsolete, enrich the message with the reason - # that made this changeset not visible - if ctx.obsolete(): - msg = obsutil._getfilteredreason(repo, changeid, ctx) - else: - msg = _("hidden revision '%s'") % changeid - - hint = _('use --hidden to access hidden revisions') - - return error.FilteredRepoLookupError(msg, hint=hint) - msg = _("filtered revision '%s' (not in '%s' subset)") - msg %= (changeid, repo.filtername) - return error.FilteredRepoLookupError(msg) +def changectxdeprecwarn(repo): + # changectx's constructor will soon lose support for these forms of + # changeids: + # * stringinfied ints + # * bookmarks, tags, branches, and other namespace identifiers + # * hex nodeid prefixes + # + # Depending on your use case, replace repo[x] by one of these: + # * If you want to support general revsets, use scmutil.revsingle(x) + # * If you know that "x" is a stringified int, use repo[int(x)] + # * If you know that "x" is a bookmark, use repo._bookmarks.changectx(x) + # * If you know that "x" is a tag, use repo[repo.tags()[x]] + # * If you know that "x" is a branch or in some other namespace, + # use the appropriate mechanism for that 
namespace + # * If you know that "x" is a hex nodeid prefix, use + # repo[scmutil.resolvehexnodeidprefix(repo, x)] + # * If "x" is a string that can be any of the above, but you don't want + # to allow general revsets (perhaps because "x" may come from a remote + # user and the revset may be too costly), use scmutil.revsymbol(repo, x) + # * If "x" can be a mix of the above, you'll have to figure it out + # yourself + repo.ui.deprecwarn("changectx.__init__ is getting more limited, see " + "context.changectxdeprecwarn() for details", "4.6", + stacklevel=4) class changectx(basectx): """A changecontext object makes access to data related to a particular changeset convenient. It represents a read-only context already present in the repo.""" - def __init__(self, repo, changeid=''): + def __init__(self, repo, changeid='.'): """changeid is a revision number, node, or tag""" - - # since basectx.__new__ already took care of copying the object, we - # don't need to do anything in __init__, so we just exit here - if isinstance(changeid, basectx): - return - - if changeid == '': - changeid = '.' 
- self._repo = repo + super(changectx, self).__init__(repo) try: if isinstance(changeid, int): self._node = repo.changelog.node(changeid) self._rev = changeid return - if not pycompat.ispy3 and isinstance(changeid, long): - changeid = str(changeid) if changeid == 'null': self._node = nullid self._rev = nullrev @@ -496,7 +435,7 @@ self._node = changeid self._rev = repo.changelog.rev(changeid) return - except error.FilteredRepoLookupError: + except error.FilteredLookupError: raise except LookupError: pass @@ -512,6 +451,7 @@ raise ValueError self._rev = r self._node = repo.changelog.node(r) + changectxdeprecwarn(repo) return except error.FilteredIndexError: raise @@ -532,17 +472,15 @@ try: self._node = repo.names.singlenode(repo, changeid) self._rev = repo.changelog.rev(self._node) + changectxdeprecwarn(repo) return except KeyError: pass - except error.FilteredRepoLookupError: - raise - except error.RepoLookupError: - pass - - self._node = repo.unfiltered().changelog._partialmatch(changeid) + + self._node = scmutil.resolvehexnodeidprefix(repo, changeid) if self._node is not None: self._rev = repo.changelog.rev(self._node) + changectxdeprecwarn(repo) return # lookup failed @@ -561,7 +499,7 @@ pass except (error.FilteredIndexError, error.FilteredLookupError, error.FilteredRepoLookupError): - raise _filterederror(repo, changeid) + raise except IndexError: pass raise error.RepoLookupError( @@ -691,7 +629,7 @@ # experimental config: merge.preferancestor for r in self._repo.ui.configlist('merge', 'preferancestor'): try: - ctx = changectx(self._repo, r) + ctx = scmutil.revsymbol(self._repo, r) except error.RepoLookupError: continue anc = ctx.node() @@ -790,7 +728,7 @@ __str__ = encoding.strmethod(__bytes__) def __repr__(self): - return "<%s %s>" % (type(self).__name__, str(self)) + return r"<%s %s>" % (type(self).__name__, str(self)) def __hash__(self): try: @@ -863,7 +801,7 @@ def isbinary(self): try: - return util.binary(self.data()) + return 
stringutil.binary(self.data()) except IOError: return False def isexec(self): @@ -954,7 +892,7 @@ """ lkr = self.linkrev() attrs = vars(self) - noctx = not ('_changeid' in attrs or '_changectx' in attrs) + noctx = not (r'_changeid' in attrs or r'_changectx' in attrs) if noctx or self.rev() == lkr: return self.linkrev() return self._adjustlinkrev(self.rev(), inclusive=True) @@ -970,14 +908,14 @@ def _parentfilectx(self, path, fileid, filelog): """create parent filectx keeping ancestry info for _adjustlinkrev()""" fctx = filectx(self._repo, path, fileid=fileid, filelog=filelog) - if '_changeid' in vars(self) or '_changectx' in vars(self): + if r'_changeid' in vars(self) or r'_changectx' in vars(self): # If self is associated with a changeset (probably explicitly # fed), ensure the created filectx is associated with a # changeset that is an ancestor of self.changectx. # This lets us later use _adjustlinkrev to get a correct link. fctx._descendantrev = self.rev() fctx._ancestrycontext = getattr(self, '_ancestrycontext', None) - elif '_descendantrev' in vars(self): + elif r'_descendantrev' in vars(self): # Otherwise propagate _descendantrev if we have one associated. fctx._descendantrev = self._descendantrev fctx._ancestrycontext = getattr(self, '_ancestrycontext', None) @@ -1012,28 +950,14 @@ return p[1] return filectx(self._repo, self._path, fileid=-1, filelog=self._filelog) - def annotate(self, follow=False, linenumber=False, skiprevs=None, - diffopts=None): - '''returns a list of tuples of ((ctx, number), line) for each line - in the file, where ctx is the filectx of the node where - that line was last changed; if linenumber parameter is true, number is - the line number at the first appearance in the managed file, otherwise, - number has a fixed value of False. 
- ''' - - def lines(text): - if text.endswith("\n"): - return text.count("\n") - return text.count("\n") + int(bool(text)) - - if linenumber: - def decorate(text, rev): - return ([annotateline(fctx=rev, lineno=i) - for i in xrange(1, lines(text) + 1)], text) - else: - def decorate(text, rev): - return ([annotateline(fctx=rev)] * lines(text), text) - + def annotate(self, follow=False, skiprevs=None, diffopts=None): + """Returns a list of annotateline objects for each line in the file + + - line.fctx is the filectx of the node where that line was last changed + - line.lineno is the line number at the first appearance in the managed + file + - line.text is the data on that line (including newline character) + """ getlog = util.lrucachefunc(lambda x: self._repo.file(x)) def parents(f): @@ -1051,7 +975,7 @@ # renamed filectx won't have a filelog yet, so set it # from the cache to save time for p in pl: - if not '_filelog' in p.__dict__: + if not r'_filelog' in p.__dict__: p._filelog = getlog(p.path()) return pl @@ -1069,60 +993,8 @@ ac = cl.ancestors([base.rev()], inclusive=True) base._ancestrycontext = ac - # This algorithm would prefer to be recursive, but Python is a - # bit recursion-hostile. Instead we do an iterative - # depth-first search. 
- - # 1st DFS pre-calculates pcache and needed - visit = [base] - pcache = {} - needed = {base: 1} - while visit: - f = visit.pop() - if f in pcache: - continue - pl = parents(f) - pcache[f] = pl - for p in pl: - needed[p] = needed.get(p, 0) + 1 - if p not in pcache: - visit.append(p) - - # 2nd DFS does the actual annotate - visit[:] = [base] - hist = {} - while visit: - f = visit[-1] - if f in hist: - visit.pop() - continue - - ready = True - pl = pcache[f] - for p in pl: - if p not in hist: - ready = False - visit.append(p) - if ready: - visit.pop() - curr = decorate(f.data(), f) - skipchild = False - if skiprevs is not None: - skipchild = f._changeid in skiprevs - curr = _annotatepair([hist[p] for p in pl], f, curr, skipchild, - diffopts) - for p in pl: - if needed[p] == 1: - del hist[p] - del needed[p] - else: - needed[p] -= 1 - - hist[f] = curr - del pcache[f] - - lineattrs, text = hist[base] - return pycompat.ziplist(lineattrs, mdiff.splitnewlines(text)) + return dagop.annotate(base, parents, skiprevs=skiprevs, + diffopts=diffopts) def ancestors(self, followfirst=False): visit = {} @@ -1147,74 +1019,6 @@ """ return self._repo.wwritedata(self.path(), self.data()) -@attr.s(slots=True, frozen=True) -class annotateline(object): - fctx = attr.ib() - lineno = attr.ib(default=False) - # Whether this annotation was the result of a skip-annotate. - skip = attr.ib(default=False) - -def _annotatepair(parents, childfctx, child, skipchild, diffopts): - r''' - Given parent and child fctxes and annotate data for parents, for all lines - in either parent that match the child, annotate the child with the parent's - data. - - Additionally, if `skipchild` is True, replace all other lines with parent - annotate data as well such that child is never blamed for any lines. - - See test-annotate.py for unit tests. 
- ''' - pblocks = [(parent, mdiff.allblocks(parent[1], child[1], opts=diffopts)) - for parent in parents] - - if skipchild: - # Need to iterate over the blocks twice -- make it a list - pblocks = [(p, list(blocks)) for (p, blocks) in pblocks] - # Mercurial currently prefers p2 over p1 for annotate. - # TODO: change this? - for parent, blocks in pblocks: - for (a1, a2, b1, b2), t in blocks: - # Changed blocks ('!') or blocks made only of blank lines ('~') - # belong to the child. - if t == '=': - child[0][b1:b2] = parent[0][a1:a2] - - if skipchild: - # Now try and match up anything that couldn't be matched, - # Reversing pblocks maintains bias towards p2, matching above - # behavior. - pblocks.reverse() - - # The heuristics are: - # * Work on blocks of changed lines (effectively diff hunks with -U0). - # This could potentially be smarter but works well enough. - # * For a non-matching section, do a best-effort fit. Match lines in - # diff hunks 1:1, dropping lines as necessary. - # * Repeat the last line as a last resort. - - # First, replace as much as possible without repeating the last line. - remaining = [(parent, []) for parent, _blocks in pblocks] - for idx, (parent, blocks) in enumerate(pblocks): - for (a1, a2, b1, b2), _t in blocks: - if a2 - a1 >= b2 - b1: - for bk in xrange(b1, b2): - if child[0][bk].fctx == childfctx: - ak = min(a1 + (bk - b1), a2 - 1) - child[0][bk] = attr.evolve(parent[0][ak], skip=True) - else: - remaining[idx][1].append((a1, a2, b1, b2)) - - # Then, look at anything left, which might involve repeating the last - # line. - for parent, blocks in remaining: - for a1, a2, b1, b2 in blocks: - for bk in xrange(b1, b2): - if child[0][bk].fctx == childfctx: - ak = min(a1 + (bk - b1), a2 - 1) - child[0][bk] = attr.evolve(parent[0][ak], skip=True) - return child - class filectx(basefilectx): """A filecontext object makes access to data related to a particular filerevision convenient.""" @@ -1326,12 +1130,12 @@ wants the ability to commit, e.g. 
workingctx or memctx.""" def __init__(self, repo, text="", user=None, date=None, extra=None, changes=None): - self._repo = repo + super(committablectx, self).__init__(repo) self._rev = None self._node = None self._text = text if date: - self._date = util.parsedate(date) + self._date = dateutil.parsedate(date) if user: self._user = user if changes: @@ -1408,7 +1212,7 @@ ui = self._repo.ui date = ui.configdate('devel', 'default-date') if date is None: - date = util.makedate() + date = dateutil.makedate() return date def subrev(self, subpath): @@ -1554,6 +1358,11 @@ p = p[:-1] return [changectx(self._repo, x) for x in p] + def _fileinfo(self, path): + # populate __dict__['_manifest'] as workingctx has no _manifestdelta + self._manifest + return super(workingctx, self)._fileinfo(path) + def filectx(self, path, filelog=None): """get a file context from the working directory""" return workingfilectx(self._repo, path, workingctx=self, @@ -1679,7 +1488,8 @@ for f in files: if self.flags(f) == 'l': d = self[f].data() - if d == '' or len(d) >= 1024 or '\n' in d or util.binary(d): + if (d == '' or len(d) >= 1024 or '\n' in d + or stringutil.binary(d)): self._repo.ui.debug('ignoring suspect symlink placeholder' ' "%s"\n' % f) continue @@ -1935,7 +1745,7 @@ def date(self): t, tz = self._changectx.date() try: - return (self._repo.wvfs.lstat(self._path).st_mtime, tz) + return (self._repo.wvfs.lstat(self._path)[stat.ST_MTIME], tz) except OSError as err: if err.errno != errno.ENOENT: raise @@ -1983,10 +1793,11 @@ wvfs.audit(f) if wvfs.isdir(f) and not wvfs.islink(f): wvfs.rmtree(f, forcibly=True) - for p in reversed(list(util.finddirs(f))): - if wvfs.isfileorlink(p): - wvfs.unlink(p) - break + if self._repo.ui.configbool('experimental', 'merge.checkpathconflicts'): + for p in reversed(list(util.finddirs(f))): + if wvfs.isfileorlink(p): + wvfs.unlink(p) + break def setflags(self, l, x): self._repo.wvfs.setflags(self._path, l, x) @@ -2008,7 +1819,6 @@ def __init__(self, repo): 
super(overlayworkingctx, self).__init__(repo) - self._repo = repo self.clean() def setbase(self, wrappedctx): @@ -2155,11 +1965,11 @@ if data is None: raise error.ProgrammingError("data must be non-None") self._auditconflicts(path) - self._markdirty(path, exists=True, data=data, date=util.makedate(), + self._markdirty(path, exists=True, data=data, date=dateutil.makedate(), flags=flags) def setflags(self, path, l, x): - self._markdirty(path, exists=True, date=util.makedate(), + self._markdirty(path, exists=True, date=dateutil.makedate(), flags=(l and 'l' or '') + (x and 'x' or '')) def remove(self, path): @@ -2448,7 +2258,7 @@ user receives the committer name and defaults to current repository username, date is the commit date in any format - supported by util.parsedate() and defaults to current date, extra + supported by dateutil.parsedate() and defaults to current date, extra is a dictionary of metadata or is left empty. """ @@ -2464,7 +2274,7 @@ self._node = None parents = [(p or nullid) for p in parents] p1, p2 = parents - self._parents = [changectx(self._repo, p) for p in (p1, p2)] + self._parents = [self._repo[p] for p in (p1, p2)] files = sorted(set(files)) self._files = files if branch is not None: @@ -2663,12 +2473,9 @@ user receives the committer name and defaults to current repository username, date is the commit date in any format supported by - util.parsedate() and defaults to current date, extra is a dictionary of + dateutil.parsedate() and defaults to current date, extra is a dictionary of metadata or is left empty. 
""" - def __new__(cls, repo, originalctx, *args, **kwargs): - return super(metadataonlyctx, cls).__new__(cls, repo) - def __init__(self, repo, originalctx, parents=None, text=None, user=None, date=None, extra=None, editor=False): if text is None: diff -r fb92df8b634c -r ed5448edcbfa mercurial/copies.py --- a/mercurial/copies.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/copies.py Wed Apr 18 15:32:08 2018 -0400 @@ -123,7 +123,7 @@ t[k] = v # remove criss-crossed copies - for k, v in t.items(): + for k, v in list(t.items()): if k in src and v in dst: del t[k] @@ -280,7 +280,7 @@ ac = repo.changelog.ancestors(revs, inclusive=True) ctx._ancestrycontext = ac def makectx(f, n): - if n in node.wdirnodes: # in a working context? + if n in node.wdirfilenodeids: # in a working context? if ctx.rev() is None: return ctx.filectx(f) return repo[None][f] @@ -685,8 +685,8 @@ # the base and present in the source. # Presence in the base is important to exclude added files, presence in the # source is important to exclude removed files. - missingfiles = filter(lambda f: f not in m1 and f in base and f in c2, - changedfiles) + filt = lambda f: f not in m1 and f in base and f in c2 + missingfiles = [f for f in changedfiles if filt(f)] if missingfiles: basenametofilename = collections.defaultdict(list) @@ -698,9 +698,6 @@ basenametofilename[basename].append(f) dirnametofilename[dirname].append(f) - # in case of a rebase/graft, base may not be a common ancestor - anc = c1.ancestor(c2) - for f in missingfiles: basename = os.path.basename(f) dirname = os.path.dirname(f) @@ -723,7 +720,7 @@ for candidate in movecandidates: f1 = c1.filectx(candidate) - if _related(f1, f2, anc.rev()): + if _related(f1, f2): # if there are a few related copies then we'll merge # changes into all of them. 
This matches the behaviour # of upstream copytracing @@ -731,7 +728,7 @@ return copies, {}, {}, {}, {} -def _related(f1, f2, limit): +def _related(f1, f2): """return True if f1 and f2 filectx have a common ancestor Walk back to common ancestor to see if the two files originate @@ -758,10 +755,8 @@ f1 = next(g1) elif f2r > f1r: f2 = next(g2) - elif f1 == f2: - return f1 # a match - elif f1r == f2r or f1r < limit or f2r < limit: - return False # copy no longer relevant + else: # f1 and f2 point to files in the same linkrev + return f1 == f2 # true if they point to the same file except StopIteration: return False @@ -829,7 +824,7 @@ c2 = getdstfctx(of, mdst[of]) # c2 might be a plain new file on added on destination side that is # unrelated to the droids we are looking for. - cr = _related(oc, c2, tca.rev()) + cr = _related(oc, c2) if cr and (of == f or of == c2.path()): # non-divergent if backwards: data['copy'][of] = f diff -r fb92df8b634c -r ed5448edcbfa mercurial/crecord.py --- a/mercurial/crecord.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/crecord.py Wed Apr 18 15:32:08 2018 -0400 @@ -23,6 +23,9 @@ scmutil, util, ) +from .utils import ( + stringutil, +) stringio = util.stringio # This is required for ncurses to display non-ASCII characters in default user @@ -547,7 +550,7 @@ chunkselector = curseschunkselector(headerlist, ui, operation) if testfn and os.path.exists(testfn): testf = open(testfn) - testcommands = map(lambda x: x.rstrip('\n'), testf.readlines()) + testcommands = [x.rstrip('\n') for x in testf.readlines()] testf.close() while True: if chunkselector.handlekeypressed(testcommands.pop(0), test=True): @@ -585,7 +588,7 @@ # long as not explicitly set to a falsy value - especially, # when not set at all. This is to stay most compatible with # previous (color only) behaviour. 
- uicolor = util.parsebool(self.ui.config('ui', 'color')) + uicolor = stringutil.parsebool(self.ui.config('ui', 'color')) self.usecolor = uicolor is not False # the currently selected header, hunk, or hunk-line @@ -950,7 +953,7 @@ # preprocess the text, converting tabs to spaces text = text.expandtabs(4) # strip \n, and convert control characters to ^[char] representation - text = re.sub(r'[\x00-\x08\x0a-\x1f]', + text = re.sub(br'[\x00-\x08\x0a-\x1f]', lambda m:'^' + chr(ord(m.group()) + 64), text.strip('\n')) if pair is not None: @@ -1058,7 +1061,7 @@ if len(lines) != self.numstatuslines: self.numstatuslines = len(lines) self.statuswin.resize(self.numstatuslines, self.xscreensize) - return [util.ellipsis(l, self.xscreensize - 1) for l in lines] + return [stringutil.ellipsis(l, self.xscreensize - 1) for l in lines] def updatescreen(self): self.statuswin.erase() @@ -1335,7 +1338,7 @@ # temporarily disable printing to windows by printstring patchdisplaystring = self.printitem(item, ignorefolding, recursechildren, towin=False) - numlines = len(patchdisplaystring) / self.xscreensize + numlines = len(patchdisplaystring) // self.xscreensize return numlines def sigwinchhandler(self, n, frame): diff -r fb92df8b634c -r ed5448edcbfa mercurial/dagop.py --- a/mercurial/dagop.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/dagop.py Wed Apr 18 15:32:08 2018 -0400 @@ -9,11 +9,15 @@ import heapq +from .thirdparty import ( + attr, +) from . import ( error, mdiff, node, patch, + pycompat, smartset, ) @@ -358,6 +362,162 @@ if inrange: yield c, linerange1 +@attr.s(slots=True, frozen=True) +class annotateline(object): + fctx = attr.ib() + lineno = attr.ib() + # Whether this annotation was the result of a skip-annotate. 
+ skip = attr.ib(default=False) + text = attr.ib(default=None) + +@attr.s(slots=True, frozen=True) +class _annotatedfile(object): + # list indexed by lineno - 1 + fctxs = attr.ib() + linenos = attr.ib() + skips = attr.ib() + # full file content + text = attr.ib() + +def _countlines(text): + if text.endswith("\n"): + return text.count("\n") + return text.count("\n") + int(bool(text)) + +def _decoratelines(text, fctx): + n = _countlines(text) + linenos = pycompat.rangelist(1, n + 1) + return _annotatedfile([fctx] * n, linenos, [False] * n, text) + +def _annotatepair(parents, childfctx, child, skipchild, diffopts): + r''' + Given parent and child fctxes and annotate data for parents, for all lines + in either parent that match the child, annotate the child with the parent's + data. + + Additionally, if `skipchild` is True, replace all other lines with parent + annotate data as well such that child is never blamed for any lines. + + See test-annotate.py for unit tests. + ''' + pblocks = [(parent, mdiff.allblocks(parent.text, child.text, opts=diffopts)) + for parent in parents] + + if skipchild: + # Need to iterate over the blocks twice -- make it a list + pblocks = [(p, list(blocks)) for (p, blocks) in pblocks] + # Mercurial currently prefers p2 over p1 for annotate. + # TODO: change this? + for parent, blocks in pblocks: + for (a1, a2, b1, b2), t in blocks: + # Changed blocks ('!') or blocks made only of blank lines ('~') + # belong to the child. + if t == '=': + child.fctxs[b1:b2] = parent.fctxs[a1:a2] + child.linenos[b1:b2] = parent.linenos[a1:a2] + child.skips[b1:b2] = parent.skips[a1:a2] + + if skipchild: + # Now try and match up anything that couldn't be matched, + # Reversing pblocks maintains bias towards p2, matching above + # behavior. + pblocks.reverse() + + # The heuristics are: + # * Work on blocks of changed lines (effectively diff hunks with -U0). + # This could potentially be smarter but works well enough. 
+ # * For a non-matching section, do a best-effort fit. Match lines in + # diff hunks 1:1, dropping lines as necessary. + # * Repeat the last line as a last resort. + + # First, replace as much as possible without repeating the last line. + remaining = [(parent, []) for parent, _blocks in pblocks] + for idx, (parent, blocks) in enumerate(pblocks): + for (a1, a2, b1, b2), _t in blocks: + if a2 - a1 >= b2 - b1: + for bk in xrange(b1, b2): + if child.fctxs[bk] == childfctx: + ak = min(a1 + (bk - b1), a2 - 1) + child.fctxs[bk] = parent.fctxs[ak] + child.linenos[bk] = parent.linenos[ak] + child.skips[bk] = True + else: + remaining[idx][1].append((a1, a2, b1, b2)) + + # Then, look at anything left, which might involve repeating the last + # line. + for parent, blocks in remaining: + for a1, a2, b1, b2 in blocks: + for bk in xrange(b1, b2): + if child.fctxs[bk] == childfctx: + ak = min(a1 + (bk - b1), a2 - 1) + child.fctxs[bk] = parent.fctxs[ak] + child.linenos[bk] = parent.linenos[ak] + child.skips[bk] = True + return child + +def annotate(base, parents, skiprevs=None, diffopts=None): + """Core algorithm for filectx.annotate() + + `parents(fctx)` is a function returning a list of parent filectxs. + """ + + # This algorithm would prefer to be recursive, but Python is a + # bit recursion-hostile. Instead we do an iterative + # depth-first search. 
+ + # 1st DFS pre-calculates pcache and needed + visit = [base] + pcache = {} + needed = {base: 1} + while visit: + f = visit.pop() + if f in pcache: + continue + pl = parents(f) + pcache[f] = pl + for p in pl: + needed[p] = needed.get(p, 0) + 1 + if p not in pcache: + visit.append(p) + + # 2nd DFS does the actual annotate + visit[:] = [base] + hist = {} + while visit: + f = visit[-1] + if f in hist: + visit.pop() + continue + + ready = True + pl = pcache[f] + for p in pl: + if p not in hist: + ready = False + visit.append(p) + if ready: + visit.pop() + curr = _decoratelines(f.data(), f) + skipchild = False + if skiprevs is not None: + skipchild = f._changeid in skiprevs + curr = _annotatepair([hist[p] for p in pl], f, curr, skipchild, + diffopts) + for p in pl: + if needed[p] == 1: + del hist[p] + del needed[p] + else: + needed[p] -= 1 + + hist[f] = curr + del pcache[f] + + a = hist[base] + return [annotateline(*r) for r in zip(a.fctxs, a.linenos, a.skips, + mdiff.splitnewlines(a.text))] + def toposort(revs, parentsfunc, firstbranch=()): """Yield revisions from heads to roots one (topo) branch at a time. diff -r fb92df8b634c -r ed5448edcbfa mercurial/dagparser.py --- a/mercurial/dagparser.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/dagparser.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,7 +14,9 @@ from . 
import ( error, pycompat, - util, +) +from .utils import ( + stringutil, ) def parsedag(desc): @@ -372,8 +374,8 @@ else: raise error.Abort(_("invalid event type in dag: " "('%s', '%s')") - % (util.escapestr(kind), - util.escapestr(data))) + % (stringutil.escapestr(kind), + stringutil.escapestr(data))) if run: yield '+%d' % run diff -r fb92df8b634c -r ed5448edcbfa mercurial/debugcommands.py --- a/mercurial/debugcommands.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/debugcommands.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,9 +14,12 @@ import operator import os import random +import re import socket import ssl +import stat import string +import subprocess import sys import tempfile import time @@ -30,6 +33,9 @@ nullrev, short, ) +from .thirdparty import ( + cbor, +) from . import ( bundle2, changegroup, @@ -46,8 +52,10 @@ fileset, formatter, hg, + httppeer, localrepo, lock as lockmod, + logcmdutil, merge as mergemod, obsolete, obsutil, @@ -64,6 +72,7 @@ setdiscovery, simplemerge, smartset, + sshpeer, sslutil, streamclone, templater, @@ -72,6 +81,14 @@ url as urlmod, util, vfs as vfsmod, + wireprotoframing, + wireprotoserver, + wireprotov2peer, +) +from .utils import ( + dateutil, + procutil, + stringutil, ) release = lockmod.release @@ -162,7 +179,7 @@ if mergeable_file: linesperrev = 2 # make a file with k lines per rev - initialmergedlines = [str(i) for i in xrange(0, total * linesperrev)] + initialmergedlines = ['%d' % i for i in xrange(0, total * linesperrev)] initialmergedlines.append("") tags = [] @@ -269,7 +286,7 @@ ui.write("\n%s%s\n" % (indent_string, named)) for deltadata in gen.deltaiter(): node, p1, p2, cs, deltabase, delta, flags = deltadata - ui.write("%s%s %s %s %s %s %s\n" % + ui.write("%s%s %s %s %s %s %d\n" % (indent_string, hex(node), hex(p1), hex(p2), hex(cs), hex(deltabase), len(delta))) @@ -339,11 +356,14 @@ if part.type == 'changegroup': version = part.params.get('version', '01') cg = changegroup.getunbundler(version, part, 'UN') - 
_debugchangegroup(ui, cg, all=all, indent=4, **opts) + if not ui.quiet: + _debugchangegroup(ui, cg, all=all, indent=4, **opts) if part.type == 'obsmarkers': - _debugobsmarkers(ui, part, indent=4, **opts) + if not ui.quiet: + _debugobsmarkers(ui, part, indent=4, **opts) if part.type == 'phase-heads': - _debugphaseheads(ui, part, indent=4) + if not ui.quiet: + _debugphaseheads(ui, part, indent=4) @command('debugbundle', [('a', 'all', None, _('show all details')), @@ -556,13 +576,13 @@ def debugdate(ui, date, range=None, **opts): """parse and display a date""" if opts[r"extended"]: - d = util.parsedate(date, util.extendeddateformats) + d = dateutil.parsedate(date, util.extendeddateformats) else: - d = util.parsedate(date) - ui.write(("internal: %s %s\n") % d) - ui.write(("standard: %s\n") % util.datestr(d)) + d = dateutil.parsedate(date) + ui.write(("internal: %d %d\n") % d) + ui.write(("standard: %s\n") % dateutil.datestr(d)) if range: - m = util.matchdate(range) + m = dateutil.matchdate(range) ui.write(("match: %s\n") % m(d[0])) @command('debugdeltachain', @@ -1001,7 +1021,7 @@ ignore = repo.dirstate._ignore if not files: # Show all the patterns - ui.write("%s\n" % repr(ignore)) + ui.write("%s\n" % pycompat.byterepr(ignore)) else: m = scmutil.match(repo[None], pats=files) for f in m.files(): @@ -1043,12 +1063,6 @@ if format not in (0, 1): raise error.Abort(_("unknown format %d") % format) - generaldelta = r.version & revlog.FLAG_GENERALDELTA - if generaldelta: - basehdr = ' delta' - else: - basehdr = ' base' - if ui.debugflag: shortfn = hex else: @@ -1061,32 +1075,46 @@ break if format == 0: - ui.write((" rev offset length " + basehdr + " linkrev" - " %s %s p2\n") % ("nodeid".ljust(idlen), "p1".ljust(idlen))) + if ui.verbose: + ui.write((" rev offset length linkrev" + " %s %s p2\n") % ("nodeid".ljust(idlen), + "p1".ljust(idlen))) + else: + ui.write((" rev linkrev %s %s p2\n") % ( + "nodeid".ljust(idlen), "p1".ljust(idlen))) elif format == 1: - ui.write((" rev flag 
offset length" - " size " + basehdr + " link p1 p2" - " %s\n") % "nodeid".rjust(idlen)) + if ui.verbose: + ui.write((" rev flag offset length size link p1" + " p2 %s\n") % "nodeid".rjust(idlen)) + else: + ui.write((" rev flag size link p1 p2 %s\n") % + "nodeid".rjust(idlen)) for i in r: node = r.node(i) - if generaldelta: - base = r.deltaparent(i) - else: - base = r.chainbase(i) if format == 0: try: pp = r.parents(node) except Exception: pp = [nullid, nullid] - ui.write("% 6d % 9d % 7d % 6d % 7d %s %s %s\n" % ( - i, r.start(i), r.length(i), base, r.linkrev(i), - shortfn(node), shortfn(pp[0]), shortfn(pp[1]))) + if ui.verbose: + ui.write("% 6d % 9d % 7d % 7d %s %s %s\n" % ( + i, r.start(i), r.length(i), r.linkrev(i), + shortfn(node), shortfn(pp[0]), shortfn(pp[1]))) + else: + ui.write("% 6d % 7d %s %s %s\n" % ( + i, r.linkrev(i), shortfn(node), shortfn(pp[0]), + shortfn(pp[1]))) elif format == 1: pr = r.parentrevs(i) - ui.write("% 6d %04x % 8d % 8d % 8d % 6d % 6d % 6d % 6d %s\n" % ( - i, r.flags(i), r.start(i), r.length(i), r.rawsize(i), - base, r.linkrev(i), pr[0], pr[1], shortfn(node))) + if ui.verbose: + ui.write("% 6d %04x % 8d % 8d % 8d % 6d % 6d % 6d %s\n" % ( + i, r.flags(i), r.start(i), r.length(i), r.rawsize(i), + r.linkrev(i), pr[0], pr[1], shortfn(node))) + else: + ui.write("% 6d %04x % 8d % 6d % 6d % 6d %s\n" % ( + i, r.flags(i), r.rawsize(i), r.linkrev(i), pr[0], pr[1], + shortfn(node))) @command('debugindexdot', cmdutil.debugrevlogopts, _('-c|-m|FILE'), optionalrepo=True) @@ -1113,7 +1141,7 @@ def writetemp(contents): (fd, name) = tempfile.mkstemp(prefix="hg-debuginstall-") - f = os.fdopen(fd, pycompat.sysstr("wb")) + f = os.fdopen(fd, r"wb") f.write(contents) f.close() return name @@ -1129,7 +1157,7 @@ try: codecs.lookup(pycompat.sysstr(encoding.encoding)) except LookupError as inst: - err = util.forcebytestr(inst) + err = stringutil.forcebytestr(inst) problems += 1 fm.condwrite(err, 'encodingerror', _(" %s\n" " (check that your locale is properly 
set)\n"), err) @@ -1185,7 +1213,7 @@ ) dir(bdiff), dir(mpatch), dir(base85), dir(osutil) # quiet pyflakes except Exception as inst: - err = util.forcebytestr(inst) + err = stringutil.forcebytestr(inst) problems += 1 fm.condwrite(err, 'extensionserror', " %s\n", err) @@ -1222,7 +1250,7 @@ try: templater.templater.frommapfile(m) except Exception as inst: - err = util.forcebytestr(inst) + err = stringutil.forcebytestr(inst) p = None fm.condwrite(err, 'defaulttemplateerror', " %s\n", err) else: @@ -1239,16 +1267,17 @@ # editor editor = ui.geteditor() editor = util.expandpath(editor) - fm.write('editor', _("checking commit editor... (%s)\n"), editor) - cmdpath = util.findexe(pycompat.shlexsplit(editor)[0]) + editorbin = procutil.shellsplit(editor)[0] + fm.write('editor', _("checking commit editor... (%s)\n"), editorbin) + cmdpath = procutil.findexe(editorbin) fm.condwrite(not cmdpath and editor == 'vi', 'vinotfound', _(" No commit editor set and can't find %s in PATH\n" " (specify a commit editor in your configuration" - " file)\n"), not cmdpath and editor == 'vi' and editor) + " file)\n"), not cmdpath and editor == 'vi' and editorbin) fm.condwrite(not cmdpath and editor != 'vi', 'editornotfound', _(" Can't find editor '%s' in PATH\n" " (specify a commit editor in your configuration" - " file)\n"), not cmdpath and editor) + " file)\n"), not cmdpath and editorbin) if not cmdpath and editor != 'vi': problems += 1 @@ -1258,7 +1287,7 @@ try: username = ui.username() except error.Abort as e: - err = util.forcebytestr(e) + err = stringutil.forcebytestr(e) problems += 1 fm.condwrite(username, 'username', _("checking username (%s)\n"), username) @@ -1367,9 +1396,9 @@ l.release() else: try: - stat = vfs.lstat(name) - age = now - stat.st_mtime - user = util.username(stat.st_uid) + st = vfs.lstat(name) + age = now - st[stat.ST_MTIME] + user = util.username(st.st_uid) locker = vfs.readlock(name) if ":" in locker: host, pid = locker.split(':') @@ -1405,7 +1434,7 @@ return h def 
printrecords(version): - ui.write(('* version %s records\n') % version) + ui.write(('* version %d records\n') % version) if version == 1: records = v1records else: @@ -1573,7 +1602,7 @@ try: date = opts.get('date') if date: - date = util.parsedate(date) + date = dateutil.parsedate(date) else: date = None prec = parsenodeid(precursor) @@ -1589,7 +1618,8 @@ metadata=metadata, ui=ui) tr.close() except ValueError as exc: - raise error.Abort(_('bad obsmarker input: %s') % exc) + raise error.Abort(_('bad obsmarker input: %s') % + pycompat.bytestr(exc)) finally: tr.release() finally: @@ -1692,6 +1722,25 @@ ui.write('\n'.join(repo.pathto(p, cwd) for p in sorted(files))) ui.write('\n') +@command('debugpeer', [], _('PATH'), norepo=True) +def debugpeer(ui, path): + """establish a connection to a peer repository""" + # Always enable peer request logging. Requires --debug to display + # though. + overrides = { + ('devel', 'debug.peer-request'): True, + } + + with ui.configoverride(overrides): + peer = hg.peer(ui, {}, path) + + local = peer.local() is not None + canpush = peer.canpush() + + ui.write(_('url: %s\n') % peer.url()) + ui.write(_('local: %s\n') % (_('yes') if local else _('no'))) + ui.write(_('pushable: %s\n') % (_('yes') if canpush else _('no'))) + @command('debugpickmergetool', [('r', 'rev', '', _('check for files in this revision'), _('REV')), ('', 'changedelete', None, _('emulate merging change and delete')), @@ -1744,15 +1793,15 @@ overrides = {} if opts['tool']: overrides[('ui', 'forcemerge')] = opts['tool'] - ui.note(('with --tool %r\n') % (opts['tool'])) + ui.note(('with --tool %r\n') % (pycompat.bytestr(opts['tool']))) with ui.configoverride(overrides, 'debugmergepatterns'): hgmerge = encoding.environ.get("HGMERGE") if hgmerge is not None: - ui.note(('with HGMERGE=%r\n') % (hgmerge)) + ui.note(('with HGMERGE=%r\n') % (pycompat.bytestr(hgmerge))) uimerge = ui.config("ui", "merge") if uimerge: - ui.note(('with ui.merge=%r\n') % (uimerge)) + ui.note(('with 
ui.merge=%r\n') % (pycompat.bytestr(uimerge))) ctx = scmutil.revsingle(repo, opts.get('rev')) m = scmutil.match(ctx, pats, opts) @@ -1784,13 +1833,20 @@ target = hg.peer(ui, {}, repopath) if keyinfo: key, old, new = keyinfo - r = target.pushkey(namespace, key, old, new) - ui.status(str(r) + '\n') + with target.commandexecutor() as e: + r = e.callcommand('pushkey', { + 'namespace': namespace, + 'key': key, + 'old': old, + 'new': new, + }).result() + + ui.status(pycompat.bytestr(r) + '\n') return not r else: for k, v in sorted(target.listkeys(namespace).iteritems()): - ui.write("%s\t%s\n" % (util.escapestr(k), - util.escapestr(v))) + ui.write("%s\t%s\n" % (stringutil.escapestr(k), + stringutil.escapestr(v))) @command('debugpvec', [], _('A B')) def debugpvec(ui, repo, a, b=None): @@ -2165,7 +2221,7 @@ treebystage = {} printedtree = None - tree = revsetlang.parse(expr, lookup=repo.__contains__) + tree = revsetlang.parse(expr, lookup=revset.lookupfn(repo)) for n, f in stages: treebystage[n] = tree = f(tree) if n in showalways or (n in showchanged and tree != printedtree): @@ -2206,7 +2262,38 @@ if not opts['show_revs']: return for c in revs: - ui.write("%s\n" % c) + ui.write("%d\n" % c) + +@command('debugserve', [ + ('', 'sshstdio', False, _('run an SSH server bound to process handles')), + ('', 'logiofd', '', _('file descriptor to log server I/O to')), + ('', 'logiofile', '', _('file to log server I/O to')), +], '') +def debugserve(ui, repo, **opts): + """run a server with advanced settings + + This command is similar to :hg:`serve`. It exists partially as a + workaround to the fact that ``hg serve --stdio`` must have specific + arguments for security reasons. 
+ """ + opts = pycompat.byteskwargs(opts) + + if not opts['sshstdio']: + raise error.Abort(_('only --sshstdio is currently supported')) + + logfh = None + + if opts['logiofd'] and opts['logiofile']: + raise error.Abort(_('cannot use both --logiofd and --logiofile')) + + if opts['logiofd']: + # Line buffered because output is line based. + logfh = os.fdopen(int(opts['logiofd']), r'ab', 1) + elif opts['logiofile']: + logfh = open(opts['logiofile'], 'ab', 1) + + s = wireprotoserver.sshserver(ui, repo, logfh=logfh) + s.serve_forever() @command('debugsetparents', [], _('REV1 [REV2]')) def debugsetparents(ui, repo, rev1, rev2=None): @@ -2220,11 +2307,11 @@ Returns 0 on success. """ - r1 = scmutil.revsingle(repo, rev1).node() - r2 = scmutil.revsingle(repo, rev2, 'null').node() + node1 = scmutil.revsingle(repo, rev1).node() + node2 = scmutil.revsingle(repo, rev2, 'null').node() with repo.wlock(): - repo.setparents(r1, r2) + repo.setparents(node1, node2) @command('debugssl', [], '[SOURCE]', optionalrepo=True) def debugssl(ui, repo, source=None, **opts): @@ -2336,7 +2423,7 @@ """ # passed to successorssets caching computation from one call to another cache = {} - ctx2str = str + ctx2str = bytes node2str = short for rev in scmutil.revrange(repo, revs): ctx = repo[rev] @@ -2394,18 +2481,34 @@ if revs is None: tres = formatter.templateresources(ui, repo) t = formatter.maketemplater(ui, tmpl, resources=tres) - ui.write(t.render(props)) + ui.write(t.renderdefault(props)) else: - displayer = cmdutil.makelogtemplater(ui, repo, tmpl) + displayer = logcmdutil.maketemplater(ui, repo, tmpl) for r in revs: displayer.show(repo[r], **pycompat.strkwargs(props)) displayer.close() +@command('debuguigetpass', [ + ('p', 'prompt', '', _('prompt text'), _('TEXT')), +], _('[-p TEXT]'), norepo=True) +def debuguigetpass(ui, prompt=''): + """show prompt to type password""" + r = ui.getpass(prompt) + ui.write(('respose: %s\n') % r) + +@command('debuguiprompt', [ + ('p', 'prompt', '', _('prompt 
text'), _('TEXT')), +], _('[-p TEXT]'), norepo=True) +def debuguiprompt(ui, prompt=''): + """show plain prompt""" + r = ui.prompt(prompt) + ui.write(('response: %s\n') % r) + @command('debugupdatecaches', []) def debugupdatecaches(ui, repo, *pats, **opts): """warm all known caches in the repository""" with repo.wlock(), repo.lock(): - repo.updatecaches() + repo.updatecaches(full=True) @command('debugupgraderepo', [ ('o', 'optimize', [], _('extra optimization to perform'), _('NAME')), @@ -2452,6 +2555,17 @@ line = fmt % (abs, f(m.rel(abs)), m.exact(abs) and 'exact' or '') ui.write("%s\n" % line.rstrip()) +@command('debugwhyunstable', [], _('REV')) +def debugwhyunstable(ui, repo, rev): + """explain instabilities of a changeset""" + for entry in obsutil.whyunstable(repo, scmutil.revsingle(repo, rev)): + dnodes = '' + if entry.get('divergentnodes'): + dnodes = ' '.join('%s (%s)' % (ctx.hex(), ctx.phasestr()) + for ctx in entry['divergentnodes']) + ' ' + ui.write('%s: %s%s %s\n' % (entry['instability'], dnodes, + entry['reason'], entry['node'])) + @command('debugwireargs', [('', 'three', '', 'three'), ('', 'four', '', 'four'), @@ -2475,3 +2589,545 @@ ui.write("%s\n" % res1) if res1 != res2: ui.warn("%s\n" % res2) + +def _parsewirelangblocks(fh): + activeaction = None + blocklines = [] + + for line in fh: + line = line.rstrip() + if not line: + continue + + if line.startswith(b'#'): + continue + + if not line.startswith(' '): + # New block. Flush previous one. + if activeaction: + yield activeaction, blocklines + + activeaction = line + blocklines = [] + continue + + # Else we start with an indent. + + if not activeaction: + raise error.Abort(_('indented line outside of block')) + + blocklines.append(line) + + # Flush last block. 
+ if activeaction: + yield activeaction, blocklines + +@command('debugwireproto', + [ + ('', 'localssh', False, _('start an SSH server for this repo')), + ('', 'peer', '', _('construct a specific version of the peer')), + ('', 'noreadstderr', False, _('do not read from stderr of the remote')), + ('', 'nologhandshake', False, + _('do not log I/O related to the peer handshake')), + ] + cmdutil.remoteopts, + _('[PATH]'), + optionalrepo=True) +def debugwireproto(ui, repo, path=None, **opts): + """send wire protocol commands to a server + + This command can be used to issue wire protocol commands to remote + peers and to debug the raw data being exchanged. + + ``--localssh`` will start an SSH server against the current repository + and connect to that. By default, the connection will perform a handshake + and establish an appropriate peer instance. + + ``--peer`` can be used to bypass the handshake protocol and construct a + peer instance using the specified class type. Valid values are ``raw``, + ``http2``, ``ssh1``, and ``ssh2``. ``raw`` instances only allow sending + raw data payloads and don't support higher-level command actions. + + ``--noreadstderr`` can be used to disable automatic reading from stderr + of the peer (for SSH connections only). Disabling automatic reading of + stderr is useful for making output more deterministic. + + Commands are issued via a mini language which is specified via stdin. + The language consists of individual actions to perform. An action is + defined by a block. A block is defined as a line with no leading + space followed by 0 or more lines with leading space. Blocks are + effectively a high-level command with additional metadata. + + Lines beginning with ``#`` are ignored. + + The following sections denote available actions. + + raw + --- + + Send raw data to the server. + + The block payload contains the raw data to send as one atomic send + operation. 
The data may not actually be delivered in a single system + call: it depends on the abilities of the transport being used. + + Each line in the block is de-indented and concatenated. Then, that + value is evaluated as a Python b'' literal. This allows the use of + backslash escaping, etc. + + raw+ + ---- + + Behaves like ``raw`` except flushes output afterwards. + + command <X> + ----------- + + Send a request to run a named command, whose name follows the ``command`` + string. + + Arguments to the command are defined as lines in this block. The format of + each line is ``<key> <value>``. e.g.:: + + command listkeys + namespace bookmarks + + If the value begins with ``eval:``, it will be interpreted as a Python + literal expression. Otherwise values are interpreted as Python b'' literals. + This allows sending complex types and encoding special byte sequences via + backslash escaping. + + The following arguments have special meaning: + + ``PUSHFILE`` + When defined, the *push* mechanism of the peer will be used instead + of the static request-response mechanism and the content of the + file specified in the value of this argument will be sent as the + command payload. + + This can be used to submit a local bundle file to the remote. + + batchbegin + ---------- + + Instruct the peer to begin a batched send. + + All ``command`` blocks are queued for execution until the next + ``batchsubmit`` block. + + batchsubmit + ----------- + + Submit previously queued ``command`` blocks as a batch request. + + This action MUST be paired with a ``batchbegin`` action. + + httprequest <method> <path> + --------------------------- + + (HTTP peer only) + + Send an HTTP request to the peer. + + The HTTP request line follows the ``httprequest`` action. e.g. ``GET /foo``. + + Arguments of the form ``<key>: <value>`` are interpreted as HTTP request + headers to add to the request. e.g. ``Accept: foo``.
+ + The following arguments are special: + + ``BODYFILE <filepath>`` + The content of the file defined as the value to this argument will be + transferred verbatim as the HTTP request body. + + ``frame <request-id> <stream-id> <stream-flags> <type> <flags> <payload>`` + Send a unified protocol frame as part of the request body. + + All frames will be collected and sent as the body to the HTTP + request. + + close + ----- + + Close the connection to the server. + + flush + ----- + + Flush data written to the server. + + readavailable + ------------- + + Close the write end of the connection and read all available data from + the server. + + If the connection to the server encompasses multiple pipes, we poll both + pipes and read available data. + + readline + -------- + + Read a line of output from the server. If there are multiple output + pipes, reads only the main pipe. + + ereadline + --------- + + Like ``readline``, but read from the stderr pipe, if available. + + read <X> + -------- + + ``read()`` N bytes from the server's main output pipe. + + eread <X> + --------- + + ``read()`` N bytes from the server's stderr pipe, if available. + + Specifying Unified Frame-Based Protocol Frames + ---------------------------------------------- + + It is possible to emit a *Unified Frame-Based Protocol* by using special + syntax. + + A frame is composed as a type, flags, and payload. These can be parsed + from a string of the form: + + <request-id> <stream-id> <stream-flags> <type> <flags> <payload> + + ``request-id`` and ``stream-id`` are integers defining the request and + stream identifiers. + + ``type`` can be an integer value for the frame type or the string name + of the type. The strings are defined in ``wireprotoframing.py``. e.g. + ``command-name``. + + ``stream-flags`` and ``flags`` are a ``|`` delimited list of flag + components. Each component (and there can be just one) can be an integer + or a flag name for stream flags or frame flags, respectively. Values are + resolved to integers and then bitwise OR'd together. + + ``payload`` represents the raw frame payload.
If it begins with + ``cbor:``, the following string is evaluated as Python code and the + resulting object is fed into a CBOR encoder. Otherwise it is interpreted + as a Python byte string literal. + """ + opts = pycompat.byteskwargs(opts) + + if opts['localssh'] and not repo: + raise error.Abort(_('--localssh requires a repository')) + + if opts['peer'] and opts['peer'] not in ('raw', 'http2', 'ssh1', 'ssh2'): + raise error.Abort(_('invalid value for --peer'), + hint=_('valid values are "raw", "ssh1", and "ssh2"')) + + if path and opts['localssh']: + raise error.Abort(_('cannot specify --localssh with an explicit ' + 'path')) + + if ui.interactive(): + ui.write(_('(waiting for commands on stdin)\n')) + + blocks = list(_parsewirelangblocks(ui.fin)) + + proc = None + stdin = None + stdout = None + stderr = None + opener = None + + if opts['localssh']: + # We start the SSH server in its own process so there is process + # separation. This prevents a whole class of potential bugs around + # shared state from interfering with server operation. + args = procutil.hgcmd() + [ + '-R', repo.root, + 'debugserve', '--sshstdio', + ] + proc = subprocess.Popen(args, stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + bufsize=0) + + stdin = proc.stdin + stdout = proc.stdout + stderr = proc.stderr + + # We turn the pipes into observers so we can log I/O. + if ui.verbose or opts['peer'] == 'raw': + stdin = util.makeloggingfileobject(ui, proc.stdin, b'i', + logdata=True) + stdout = util.makeloggingfileobject(ui, proc.stdout, b'o', + logdata=True) + stderr = util.makeloggingfileobject(ui, proc.stderr, b'e', + logdata=True) + + # --localssh also implies the peer connection settings. 
+ + url = 'ssh://localserver' + autoreadstderr = not opts['noreadstderr'] + + if opts['peer'] == 'ssh1': + ui.write(_('creating ssh peer for wire protocol version 1\n')) + peer = sshpeer.sshv1peer(ui, url, proc, stdin, stdout, stderr, + None, autoreadstderr=autoreadstderr) + elif opts['peer'] == 'ssh2': + ui.write(_('creating ssh peer for wire protocol version 2\n')) + peer = sshpeer.sshv2peer(ui, url, proc, stdin, stdout, stderr, + None, autoreadstderr=autoreadstderr) + elif opts['peer'] == 'raw': + ui.write(_('using raw connection to peer\n')) + peer = None + else: + ui.write(_('creating ssh peer from handshake results\n')) + peer = sshpeer.makepeer(ui, url, proc, stdin, stdout, stderr, + autoreadstderr=autoreadstderr) + + elif path: + # We bypass hg.peer() so we can proxy the sockets. + # TODO consider not doing this because we skip + # ``hg.wirepeersetupfuncs`` and potentially other useful functionality. + u = util.url(path) + if u.scheme != 'http': + raise error.Abort(_('only http:// paths are currently supported')) + + url, authinfo = u.authinfo() + openerargs = { + r'useragent': b'Mercurial debugwireproto', + } + + # Turn pipes/sockets into observers so we can log I/O. + if ui.verbose: + openerargs.update({ + r'loggingfh': ui, + r'loggingname': b's', + r'loggingopts': { + r'logdata': True, + r'logdataapis': False, + }, + }) + + if ui.debugflag: + openerargs[r'loggingopts'][r'logdataapis'] = True + + # Don't send default headers when in raw mode. This allows us to + # bypass most of the behavior of our URL handling code so we can + # have near complete control over what's sent on the wire. + if opts['peer'] == 'raw': + openerargs[r'sendaccept'] = False + + opener = urlmod.opener(ui, authinfo, **openerargs) + + if opts['peer'] == 'http2': + ui.write(_('creating http peer for wire protocol version 2\n')) + # We go through makepeer() because we need an API descriptor for + # the peer instance to be useful. 
+ with ui.configoverride({ + ('experimental', 'httppeer.advertise-v2'): True}): + if opts['nologhandshake']: + ui.pushbuffer() + + peer = httppeer.makepeer(ui, path, opener=opener) + + if opts['nologhandshake']: + ui.popbuffer() + + if not isinstance(peer, httppeer.httpv2peer): + raise error.Abort(_('could not instantiate HTTP peer for ' + 'wire protocol version 2'), + hint=_('the server may not have the feature ' + 'enabled or is not allowing this ' + 'client version')) + + elif opts['peer'] == 'raw': + ui.write(_('using raw connection to peer\n')) + peer = None + elif opts['peer']: + raise error.Abort(_('--peer %s not supported with HTTP peers') % + opts['peer']) + else: + peer = httppeer.makepeer(ui, path, opener=opener) + + # We /could/ populate stdin/stdout with sock.makefile()... + else: + raise error.Abort(_('unsupported connection configuration')) + + batchedcommands = None + + # Now perform actions based on the parsed wire language instructions. + for action, lines in blocks: + if action in ('raw', 'raw+'): + if not stdin: + raise error.Abort(_('cannot call raw/raw+ on this peer')) + + # Concatenate the data together. + data = ''.join(l.lstrip() for l in lines) + data = stringutil.unescapestr(data) + stdin.write(data) + + if action == 'raw+': + stdin.flush() + elif action == 'flush': + if not stdin: + raise error.Abort(_('cannot call flush on this peer')) + stdin.flush() + elif action.startswith('command'): + if not peer: + raise error.Abort(_('cannot send commands unless peer instance ' + 'is available')) + + command = action.split(' ', 1)[1] + + args = {} + for line in lines: + # We need to allow empty values. 
+ fields = line.lstrip().split(' ', 1) + if len(fields) == 1: + key = fields[0] + value = '' + else: + key, value = fields + + if value.startswith('eval:'): + value = stringutil.evalpythonliteral(value[5:]) + else: + value = stringutil.unescapestr(value) + + args[key] = value + + if batchedcommands is not None: + batchedcommands.append((command, args)) + continue + + ui.status(_('sending %s command\n') % command) + + if 'PUSHFILE' in args: + with open(args['PUSHFILE'], r'rb') as fh: + del args['PUSHFILE'] + res, output = peer._callpush(command, fh, + **pycompat.strkwargs(args)) + ui.status(_('result: %s\n') % stringutil.escapestr(res)) + ui.status(_('remote output: %s\n') % + stringutil.escapestr(output)) + else: + with peer.commandexecutor() as e: + res = e.callcommand(command, args).result() + + if isinstance(res, wireprotov2peer.commandresponse): + val = list(res.cborobjects()) + ui.status(_('response: %s\n') % stringutil.pprint(val)) + + else: + ui.status(_('response: %s\n') % stringutil.pprint(res)) + + elif action == 'batchbegin': + if batchedcommands is not None: + raise error.Abort(_('nested batchbegin not allowed')) + + batchedcommands = [] + elif action == 'batchsubmit': + # There is a batching API we could go through. But it would be + # difficult to normalize requests into function calls. It is easier + # to bypass this layer and normalize to commands + args. 
+ ui.status(_('sending batch with %d sub-commands\n') % + len(batchedcommands)) + for i, chunk in enumerate(peer._submitbatch(batchedcommands)): + ui.status(_('response #%d: %s\n') % + (i, stringutil.escapestr(chunk))) + + batchedcommands = None + + elif action.startswith('httprequest '): + if not opener: + raise error.Abort(_('cannot use httprequest without an HTTP ' + 'peer')) + + request = action.split(' ', 2) + if len(request) != 3: + raise error.Abort(_('invalid httprequest: expected format is ' + '"httprequest ')) + + method, httppath = request[1:] + headers = {} + body = None + frames = [] + for line in lines: + line = line.lstrip() + m = re.match(b'^([a-zA-Z0-9_-]+): (.*)$', line) + if m: + headers[m.group(1)] = m.group(2) + continue + + if line.startswith(b'BODYFILE '): + with open(line.split(b' ', 1), 'rb') as fh: + body = fh.read() + elif line.startswith(b'frame '): + frame = wireprotoframing.makeframefromhumanstring( + line[len(b'frame '):]) + + frames.append(frame) + else: + raise error.Abort(_('unknown argument to httprequest: %s') % + line) + + url = path + httppath + + if frames: + body = b''.join(bytes(f) for f in frames) + + req = urlmod.urlreq.request(pycompat.strurl(url), body, headers) + + # urllib.Request insists on using has_data() as a proxy for + # determining the request method. Override that to use our + # explicitly requested method. 
+ req.get_method = lambda: method + + try: + res = opener.open(req) + body = res.read() + except util.urlerr.urlerror as e: + e.read() + continue + + if res.headers.get('Content-Type') == 'application/mercurial-cbor': + ui.write(_('cbor> %s\n') % stringutil.pprint(cbor.loads(body))) + + elif action == 'close': + peer.close() + elif action == 'readavailable': + if not stdout or not stderr: + raise error.Abort(_('readavailable not available on this peer')) + + stdin.close() + stdout.read() + stderr.read() + + elif action == 'readline': + if not stdout: + raise error.Abort(_('readline not available on this peer')) + stdout.readline() + elif action == 'ereadline': + if not stderr: + raise error.Abort(_('ereadline not available on this peer')) + stderr.readline() + elif action.startswith('read '): + count = int(action.split(' ', 1)[1]) + if not stdout: + raise error.Abort(_('read not available on this peer')) + stdout.read(count) + elif action.startswith('eread '): + count = int(action.split(' ', 1)[1]) + if not stderr: + raise error.Abort(_('eread not available on this peer')) + stderr.read(count) + else: + raise error.Abort(_('unknown action: %s') % action) + + if batchedcommands is not None: + raise error.Abort(_('unclosed "batchbegin" request')) + + if peer: + peer.close() + + if proc: + proc.kill() diff -r fb92df8b634c -r ed5448edcbfa mercurial/default.d/mergetools.rc --- a/mercurial/default.d/mergetools.rc Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/default.d/mergetools.rc Wed Apr 18 15:32:08 2018 -0400 @@ -1,7 +1,7 @@ # Some default global settings for common merge tools [merge-tools] -kdiff3.args=--auto --L1 base --L2 local --L3 other $base $local $other -o $output +kdiff3.args=--auto --L1 $labelbase --L2 $labellocal --L3 $labelother $base $local $other -o $output kdiff3.regkey=Software\KDiff3 kdiff3.regkeyalt=Software\Wow6432Node\KDiff3 kdiff3.regappend=\kdiff3.exe @@ -26,7 +26,7 @@ gpyfm.gui=True meld.gui=True -meld.args=--label='local' $local 
--label='merged' $base --label='other' $other -o $output +meld.args=--label=$labellocal $local --label='merged' $base --label=$labelother $other -o $output meld.check=changed meld.diffargs=-a --label=$plabel1 $parent --label=$clabel $child @@ -35,7 +35,7 @@ tkdiff.priority=-8 tkdiff.diffargs=-L $plabel1 $parent -L $clabel $child -xxdiff.args=--show-merged-pane --exit-with-merge-status --title1 local --title2 base --title3 other --merged-filename $output --merge $local $base $other +xxdiff.args=--show-merged-pane --exit-with-merge-status --title1 $labellocal --title2 $labelbase --title3 $labelother --merged-filename $output --merge $local $base $other xxdiff.gui=True xxdiff.priority=-8 xxdiff.diffargs=--title1 $plabel1 $parent --title2 $clabel $child @@ -44,7 +44,7 @@ diffmerge.regkeyalt=Software\Wow6432Node\SourceGear\SourceGear DiffMerge\ diffmerge.regname=Location diffmerge.priority=-7 -diffmerge.args=-nosplash -merge -title1=local -title2=merged -title3=other $local $base $other -result=$output +diffmerge.args=-nosplash -merge -title1=$labellocal -title2=merged -title3=$labelother $local $base $other -result=$output diffmerge.check=changed diffmerge.gui=True diffmerge.diffargs=--nosplash --title1=$plabel1 --title2=$clabel $parent $child @@ -72,7 +72,7 @@ tortoisemerge.priority=-8 tortoisemerge.diffargs=/base:$parent /mine:$child /basename:$plabel1 /minename:$clabel -ecmerge.args=$base $local $other --mode=merge3 --title0=base --title1=local --title2=other --to=$output +ecmerge.args=$base $local $other --mode=merge3 --title0=$labelbase --title1=$labellocal --title2=$labelother --to=$output ecmerge.regkey=Software\Elli\xc3\xa9 Computing\Merge ecmerge.regkeyalt=Software\Wow6432Node\Elli\xc3\xa9 Computing\Merge ecmerge.gui=True @@ -93,7 +93,7 @@ filemergexcode.gui=True ; Windows version of Beyond Compare -beyondcompare3.args=$local $other $base $output /ro /lefttitle=local /centertitle=base /righttitle=other /automerge /reviewconflicts /solo 
+beyondcompare3.args=$local $other $base $output /ro /lefttitle=$labellocal /centertitle=$labelbase /righttitle=$labelother /automerge /reviewconflicts /solo beyondcompare3.regkey=Software\Scooter Software\Beyond Compare 3 beyondcompare3.regname=ExePath beyondcompare3.gui=True @@ -113,7 +113,7 @@ bcomposx.priority=-1 bcomposx.diffargs=-lro -lefttitle=$plabel1 -righttitle=$clabel -solo -expandall $parent $child -winmerge.args=/e /x /wl /ub /dl other /dr local $other $local $output +winmerge.args=/e /x /wl /ub /dl $labelother /dr $labellocal $other $local $output winmerge.regkey=Software\Thingamahoochie\WinMerge winmerge.regkeyalt=Software\Wow6432Node\Thingamahoochie\WinMerge\ winmerge.regname=Executable diff -r fb92df8b634c -r ed5448edcbfa mercurial/destutil.py --- a/mercurial/destutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/destutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,8 +13,42 @@ error, obsutil, scmutil, + stack ) +def orphanpossibledestination(repo, rev): + """Return all changesets that may be a new parent for orphan `rev`. + + This function works fine on non-orphan revisions, it's just silly + because there's no destination implied by obsolete markers, so + it'll return nothing. + """ + tonode = repo.changelog.node + parents = repo.changelog.parentrevs + torev = repo.changelog.rev + dest = set() + tovisit = list(parents(rev)) + while tovisit: + r = tovisit.pop() + succsets = obsutil.successorssets(repo, tonode(r)) + if not succsets: + # if there are no successors for r, r was probably pruned + # and we should walk up to r's parents to try and find + # some successors. + tovisit.extend(parents(r)) + else: + # We should probably pick only one destination from split + # (case where '1 < len(ss)'), This could be the currently + # tipmost, but the correct result is less clear when + # results of the split have been moved such that they + # reside on multiple branches. 
+ for ss in succsets: + for n in ss: + dr = torev(n) + if dr != -1: + dest.add(dr) + return dest + def _destupdateobs(repo, clean): """decide of an update destination from obsolescence markers""" node = None @@ -54,10 +88,10 @@ def _destupdatebook(repo, clean): """decide on an update destination from active bookmark""" # we also move the active bookmark, if any - activemark = None - node, movemark = bookmarks.calculateupdate(repo.ui, repo, None) - if node is not None: - activemark = node + node = None + activemark, movemark = bookmarks.calculateupdate(repo.ui, repo) + if activemark is not None: + node = repo._bookmarks[activemark] return node, movemark, activemark def _destupdatebranch(repo, clean): @@ -235,7 +269,7 @@ """find merge destination in the active bookmark case""" node = None bmheads = bookmarks.headsforactive(repo) - curhead = repo[repo._activebookmark].node() + curhead = repo._bookmarks[repo._activebookmark] if len(bmheads) == 2: if curhead == bmheads[0]: node = bmheads[1] @@ -339,30 +373,28 @@ onheadcheck=onheadcheck, destspace=destspace) return repo[node].rev() -histeditdefaultrevset = 'reverse(only(.) and not public() and not ::merge())' - def desthistedit(ui, repo): """Default base revision to edit for `hg histedit`.""" - default = ui.config('histedit', 'defaultrev', histeditdefaultrevset) - if default: + default = ui.config('histedit', 'defaultrev') + + if default is None: + revs = stack.getstack(repo) + elif default: revs = scmutil.revrange(repo, [default]) - if revs: - # The revset supplied by the user may not be in ascending order nor - # take the first revision. So do this manually. - revs.sort() - return revs.first() + + if revs: + # Take the first revision of the revset as the root + return revs.min() return None def stackbase(ui, repo): - # The histedit default base stops at public changesets, branchpoints, - # and merges, which is exactly what we want for a stack. 
- revs = scmutil.revrange(repo, [histeditdefaultrevset]) - return revs.last() if revs else None + revs = stack.getstack(repo) + return revs.first() if revs else None def _statusotherbook(ui, repo): bmheads = bookmarks.headsforactive(repo) - curhead = repo[repo._activebookmark].node() + curhead = repo._bookmarks[repo._activebookmark] if repo.revs('%n and parents()', curhead): # we are on the active bookmark bmheads = [b for b in bmheads if curhead != b] diff -r fb92df8b634c -r ed5448edcbfa mercurial/diffhelpers.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/diffhelpers.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,77 @@ +# diffhelpers.py - helper routines for patch +# +# Copyright 2009 Matt Mackall and others +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from .i18n import _ + +from . import ( + error, +) + +def addlines(fp, hunk, lena, lenb, a, b): + """Read lines from fp into the hunk + + The hunk is parsed into two arrays, a and b. a gets the old state of + the text, b gets the new state. The control char from the hunk is saved + when inserting into a, but not b (for performance while deleting files.) + """ + while True: + todoa = lena - len(a) + todob = lenb - len(b) + num = max(todoa, todob) + if num == 0: + break + for i in xrange(num): + s = fp.readline() + if not s: + raise error.ParseError(_('incomplete hunk')) + if s == "\\ No newline at end of file\n": + fixnewline(hunk, a, b) + continue + if s == '\n' or s == '\r\n': + # Some patches may be missing the control char + # on empty lines. Supply a leading space. 
+ s = ' ' + s + hunk.append(s) + if s.startswith('+'): + b.append(s[1:]) + elif s.startswith('-'): + a.append(s) + else: + b.append(s[1:]) + a.append(s) + +def fixnewline(hunk, a, b): + """Fix up the last lines of a and b when the patch has no newline at EOF""" + l = hunk[-1] + # tolerate CRLF in last line + if l.endswith('\r\n'): + hline = l[:-2] + else: + hline = l[:-1] + + if hline.startswith((' ', '+')): + b[-1] = hline[1:] + if hline.startswith((' ', '-')): + a[-1] = hline + hunk[-1] = hline + +def testhunk(a, b, bstart): + """Compare the lines in a with the lines in b + + a is assumed to have a control char at the start of each line, this char + is ignored in the compare. + """ + alen = len(a) + blen = len(b) + if alen > blen - bstart: + return False + for i in xrange(alen): + if a[i][1:] != b[i + bstart]: + return False + return True diff -r fb92df8b634c -r ed5448edcbfa mercurial/dirstate.py --- a/mercurial/dirstate.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/dirstate.py Wed Apr 18 15:32:08 2018 -0400 @@ -49,7 +49,7 @@ '''Get "now" timestamp on filesystem''' tmpfd, tmpname = vfs.mkstemp() try: - return os.fstat(tmpfd).st_mtime + return os.fstat(tmpfd)[stat.ST_MTIME] finally: os.close(tmpfd) vfs.unlink(tmpname) @@ -99,27 +99,6 @@ # normally, so we don't have a try/finally here on purpose. self._parentwriters -= 1 - def beginparentchange(self): - '''Marks the beginning of a set of changes that involve changing - the dirstate parents. If there is an exception during this time, - the dirstate will not be written when the wlock is released. This - prevents writing an incoherent dirstate where the parent doesn't - match the contents. - ''' - self._ui.deprecwarn('beginparentchange is obsoleted by the ' - 'parentchange context manager.', '4.3') - self._parentwriters += 1 - - def endparentchange(self): - '''Marks the end of a set of changes that involve changing the - dirstate parents. 
Once all parent changes have been marked done, - the wlock will be free to write the dirstate on release. - ''' - self._ui.deprecwarn('endparentchange is obsoleted by the ' - 'parentchange context manager.', '4.3') - if self._parentwriters > 0: - self._parentwriters -= 1 - def pendingparentchange(self): '''Returns true if the dirstate is in the middle of a set of changes that modify the dirstate parent. @@ -360,7 +339,7 @@ rereads the dirstate. Use localrepo.invalidatedirstate() if you want to check whether the dirstate has changed before rereading it.''' - for a in ("_map", "_branch", "_ignore"): + for a in (r"_map", r"_branch", r"_ignore"): if a in self.__dict__: delattr(self, a) self._lastnormaltime = 0 @@ -392,7 +371,8 @@ if state == 'a' or oldstate == 'r': scmutil.checkfilename(f) if self._map.hastrackeddir(f): - raise error.Abort(_('directory %r already in dirstate') % f) + raise error.Abort(_('directory %r already in dirstate') % + pycompat.bytestr(f)) # shadows for d in util.finddirs(f): if self._map.hastrackeddir(d): @@ -400,7 +380,8 @@ entry = self._map.get(d) if entry is not None and entry[0] != 'r': raise error.Abort( - _('file %r in dirstate clashes with %r') % (d, f)) + _('file %r in dirstate clashes with %r') % + (pycompat.bytestr(d), pycompat.bytestr(f))) self._dirty = True self._updatedfiles.add(f) self._map.addfile(f, oldstate, state, mode, size, mtime) @@ -408,7 +389,7 @@ def normal(self, f): '''Mark a file normal and clean.''' s = os.lstat(self._join(f)) - mtime = s.st_mtime + mtime = s[stat.ST_MTIME] self._addpath(f, 'n', s.st_mode, s.st_size & _rangemask, mtime & _rangemask) self._map.copymap.pop(f, None) @@ -647,7 +628,7 @@ self._origpl = None # use the modification time of the newly created temporary file as the # filesystem's notion of 'now' - now = util.fstat(st).st_mtime & _rangemask + now = util.fstat(st)[stat.ST_MTIME] & _rangemask # enough 'delaywrite' prevents 'pack_dirstate' from dropping # timestamp of each entries in dirstate, 
because of 'now > mtime' @@ -808,6 +789,17 @@ else: badfn(ff, encoding.strtolocal(inst.strerror)) + # match.files() may contain explicitly-specified paths that shouldn't + # be taken; drop them from the list of files found. dirsfound/notfound + # aren't filtered here because they will be tested later. + if match.anypats(): + for f in list(results): + if f == '.hg' or f in subrepos: + # keep sentinel to disable further out-of-repo walks + continue + if not match(f): + del results[f] + # Case insensitive filesystems cannot rely on lstat() failing to detect # a case-only rename. Prune the stat object for any file that does not # match the case in the filesystem, if there are multiple files that @@ -1078,9 +1070,10 @@ or size == -2 # other parent or fn in copymap): madd(fn) - elif time != st.st_mtime and time != st.st_mtime & _rangemask: + elif (time != st[stat.ST_MTIME] + and time != st[stat.ST_MTIME] & _rangemask): ladd(fn) - elif st.st_mtime == lastnormaltime: + elif st[stat.ST_MTIME] == lastnormaltime: # fn may have just been marked as normal and it may have # changed in the same second without changing its size. # This can happen if we quickly do multiple commits. @@ -1237,9 +1230,12 @@ util.clearcachedproperty(self, "nonnormalset") util.clearcachedproperty(self, "otherparentset") - def iteritems(self): + def items(self): return self._map.iteritems() + # forward for python2,3 compat + iteritems = items + def __len__(self): return len(self._map) @@ -1264,9 +1260,9 @@ def addfile(self, f, oldstate, state, mode, size, mtime): """Add a tracked file to the dirstate.""" - if oldstate in "?r" and "_dirs" in self.__dict__: + if oldstate in "?r" and r"_dirs" in self.__dict__: self._dirs.addpath(f) - if oldstate == "?" and "_alldirs" in self.__dict__: + if oldstate == "?" and r"_alldirs" in self.__dict__: self._alldirs.addpath(f) self._map[f] = dirstatetuple(state, mode, size, mtime) if state != 'n' or mtime == -1: @@ -1282,11 +1278,11 @@ the file's previous state. 
In the future, we should refactor this to be more explicit about what that state is. """ - if oldstate not in "?r" and "_dirs" in self.__dict__: + if oldstate not in "?r" and r"_dirs" in self.__dict__: self._dirs.delpath(f) - if oldstate == "?" and "_alldirs" in self.__dict__: + if oldstate == "?" and r"_alldirs" in self.__dict__: self._alldirs.addpath(f) - if "filefoldmap" in self.__dict__: + if r"filefoldmap" in self.__dict__: normed = util.normcase(f) self.filefoldmap.pop(normed, None) self._map[f] = dirstatetuple('r', 0, size, 0) @@ -1299,11 +1295,11 @@ """ exists = self._map.pop(f, None) is not None if exists: - if oldstate != "r" and "_dirs" in self.__dict__: + if oldstate != "r" and r"_dirs" in self.__dict__: self._dirs.delpath(f) - if "_alldirs" in self.__dict__: + if r"_alldirs" in self.__dict__: self._alldirs.delpath(f) - if "filefoldmap" in self.__dict__: + if r"filefoldmap" in self.__dict__: normed = util.normcase(f) self.filefoldmap.pop(normed, None) self.nonnormalset.discard(f) @@ -1438,7 +1434,7 @@ # This heuristic is imperfect in many ways, so in a future dirstate # format update it makes sense to just record the number of entries # on write. 
- self._map = parsers.dict_new_presized(len(st) / 71) + self._map = parsers.dict_new_presized(len(st) // 71) # Python's garbage collector triggers a GC each time a certain number # of container objects (the number being defined by diff -r fb92df8b634c -r ed5448edcbfa mercurial/discovery.py --- a/mercurial/discovery.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/discovery.py Wed Apr 18 15:32:08 2018 -0400 @@ -53,13 +53,8 @@ return treediscovery.findcommonincoming(repo, remote, heads, force) if heads: - allknown = True knownnode = repo.changelog.hasnode # no nodemap until it is filtered - for h in heads: - if not knownnode(h): - allknown = False - break - if allknown: + if all(knownnode(h) for h in heads): return (heads, False, heads) res = setdiscovery.findcommonheads(repo.ui, repo, remote, @@ -208,13 +203,16 @@ headssum = {} # A. Create set of branches involved in the push. branches = set(repo[n].branch() for n in outgoing.missing) - remotemap = remote.branchmap() + + with remote.commandexecutor() as e: + remotemap = e.callcommand('branchmap', {}).result() + newbranches = branches - set(remotemap) branches.difference_update(newbranches) # A. 
register remote heads remotebranches = set() - for branch, heads in remote.branchmap().iteritems(): + for branch, heads in remotemap.iteritems(): remotebranches.add(branch) known = [] unsynced = [] @@ -292,7 +290,12 @@ repo = pushop.repo.unfiltered() remote = pushop.remote localbookmarks = repo._bookmarks - remotebookmarks = remote.listkeys('bookmarks') + + with remote.commandexecutor() as e: + remotebookmarks = e.callcommand('listkeys', { + 'namespace': 'bookmarks', + }).result() + bookmarkedheads = set() # internal config: bookmarks.pushing @@ -302,12 +305,12 @@ for bm in localbookmarks: rnode = remotebookmarks.get(bm) if rnode and rnode in repo: - lctx, rctx = repo[bm], repo[rnode] + lctx, rctx = localbookmarks.changectx(bm), repo[rnode] if bookmarks.validdest(repo, rctx, lctx): bookmarkedheads.add(lctx.node()) else: if bm in newbookmarks and bm not in remotebookmarks: - bookmarkedheads.add(repo[bm].node()) + bookmarkedheads.add(localbookmarks[bm]) return bookmarkedheads diff -r fb92df8b634c -r ed5448edcbfa mercurial/dispatch.py --- a/mercurial/dispatch.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/dispatch.py Wed Apr 18 15:32:08 2018 -0400 @@ -35,13 +35,15 @@ hook, profiling, pycompat, - registrar, scmutil, ui as uimod, util, ) -unrecoverablewrite = registrar.command.unrecoverablewrite +from .utils import ( + procutil, + stringutil, +) class request(object): def __init__(self, args, ui=None, repo=None, fin=None, fout=None, @@ -85,7 +87,7 @@ req = request(pycompat.sysargv[1:]) err = None try: - status = (dispatch(req) or 0) & 255 + status = (dispatch(req) or 0) except error.StdioError as e: err = e status = -1 @@ -106,11 +108,36 @@ except IOError: status = -1 + _silencestdio() sys.exit(status & 255) -def _initstdio(): - for fp in (sys.stdin, sys.stdout, sys.stderr): - util.setbinary(fp) +if pycompat.ispy3: + def _initstdio(): + pass + + def _silencestdio(): + for fp in (sys.stdout, sys.stderr): + # Check if the file is okay + try: + fp.flush() + continue + 
except IOError: + pass + # Otherwise mark it as closed to silence "Exception ignored in" + # message emitted by the interpreter finalizer. Be careful to + # not close procutil.stdout, which may be a fdopen-ed file object + # and its close() actually closes the underlying file descriptor. + try: + fp.close() + except IOError: + pass +else: + def _initstdio(): + for fp in (sys.stdin, sys.stdout, sys.stderr): + procutil.setbinary(fp) + + def _silencestdio(): + pass def _getsimilar(symbols, value): sim = lambda x: difflib.SequenceMatcher(None, value, x).ratio() @@ -132,8 +159,8 @@ similar = _getsimilar(inst.symbols, inst.function) if len(inst.args) > 1: write(_("hg: parse error at %s: %s\n") % - (inst.args[1], inst.args[0])) - if (inst.args[0][0] == ' '): + (pycompat.bytestr(inst.args[1]), inst.args[0])) + if inst.args[0].startswith(' '): write(_("unexpected leading whitespace\n")) else: write(_("hg: parse error: %s\n") % inst.args[0]) @@ -142,7 +169,7 @@ write(_("(%s)\n") % inst.hint) def _formatargs(args): - return ' '.join(util.shellquote(a) for a in args) + return ' '.join(procutil.shellquote(a) for a in args) def dispatch(req): "run the command specified in req.args" @@ -151,7 +178,7 @@ elif req.ui: ferr = req.ui.ferr else: - ferr = util.stderr + ferr = procutil.stderr try: if not req.ui: @@ -383,7 +410,7 @@ if not util.safehasattr(fn, '_origfunc'): args = getattr(fn, 'args', args) if args: - cmd = ' '.join(map(util.shellquote, args)) + cmd = ' '.join(map(procutil.shellquote, args)) nums = [] def replacer(m): @@ -413,14 +440,14 @@ # parameters, separated out into words. Emulate the same behavior here by # quoting the arguments individually. POSIX shells will then typically # tokenize each argument into exactly one word. 
- replacemap['"$@"'] = ' '.join(util.shellquote(arg) for arg in args) + replacemap['"$@"'] = ' '.join(procutil.shellquote(arg) for arg in args) # escape '\$' for regex regex = '|'.join(replacemap.keys()).replace('$', br'\$') r = re.compile(regex) return r.sub(lambda x: replacemap[x.group()], cmd) class cmdalias(object): - def __init__(self, name, definition, cmdtable, source): + def __init__(self, ui, name, definition, cmdtable, source): self.name = self.cmd = name self.cmdname = '' self.definition = definition @@ -447,6 +474,7 @@ return if self.definition.startswith('!'): + shdef = self.definition[1:] self.shell = True def fn(ui, *args): env = {'HG_ARGS': ' '.join((self.name,) + args)} @@ -460,24 +488,26 @@ "of %i variable in alias '%s' definition.\n" % (int(m.groups()[0]), self.name)) return '' - cmd = re.sub(br'\$(\d+|\$)', _checkvar, self.definition[1:]) + cmd = re.sub(br'\$(\d+|\$)', _checkvar, shdef) cmd = aliasinterpolate(self.name, args, cmd) return ui.system(cmd, environ=env, blockedtag='alias_%s' % self.name) self.fn = fn + self._populatehelp(ui, name, shdef, self.fn) return try: args = pycompat.shlexsplit(self.definition) except ValueError as inst: self.badalias = (_("error in definition for alias '%s': %s") - % (self.name, inst)) + % (self.name, stringutil.forcebytestr(inst))) return earlyopts, args = _earlysplitopts(args) if earlyopts: self.badalias = (_("error in definition for alias '%s': %s may " "only be given on the command line") - % (self.name, '/'.join(zip(*earlyopts)[0]))) + % (self.name, '/'.join(pycompat.ziplist(*earlyopts) + [0]))) return self.cmdname = cmd = args.pop(0) self.givenargs = args @@ -485,14 +515,12 @@ try: tableentry = cmdutil.findcmd(cmd, cmdtable, False)[1] if len(tableentry) > 2: - self.fn, self.opts, self.help = tableentry + self.fn, self.opts, cmdhelp = tableentry else: self.fn, self.opts = tableentry + cmdhelp = None - if self.help.startswith("hg " + cmd): - # drop prefix in old-style help lines so hg shows the alias - 
self.help = self.help[4 + len(cmd):] - self.__doc__ = self.fn.__doc__ + self._populatehelp(ui, name, cmd, self.fn, cmdhelp) except error.UnknownCommand: self.badalias = (_("alias '%s' resolves to unknown command '%s'") @@ -502,13 +530,36 @@ self.badalias = (_("alias '%s' resolves to ambiguous command '%s'") % (self.name, cmd)) + def _populatehelp(self, ui, name, cmd, fn, defaulthelp=None): + # confine strings to be passed to i18n.gettext() + cfg = {} + for k in ('doc', 'help'): + v = ui.config('alias', '%s:%s' % (name, k), None) + if v is None: + continue + if not encoding.isasciistr(v): + self.badalias = (_("non-ASCII character in alias definition " + "'%s:%s'") % (name, k)) + return + cfg[k] = v + + self.help = cfg.get('help', defaulthelp or '') + if self.help and self.help.startswith("hg " + cmd): + # drop prefix in old-style help lines so hg shows the alias + self.help = self.help[4 + len(cmd):] + + doc = cfg.get('doc', pycompat.getdoc(fn)) + if doc is not None: + doc = pycompat.sysstr(doc) + self.__doc__ = doc + @property def args(self): args = pycompat.maplist(util.expandpath, self.givenargs) return aliasargs(self.fn, args) def __getattr__(self, name): - adefaults = {r'norepo': True, r'cmdtype': unrecoverablewrite, + adefaults = {r'norepo': True, r'intents': set(), r'optionalrepo': False, r'inferrepo': False} if name not in adefaults: raise AttributeError(name) @@ -546,7 +597,8 @@ class lazyaliasentry(object): """like a typical command entry (func, opts, help), but is lazy""" - def __init__(self, name, definition, cmdtable, source): + def __init__(self, ui, name, definition, cmdtable, source): + self.ui = ui self.name = name self.definition = definition self.cmdtable = cmdtable.copy() @@ -554,7 +606,8 @@ @util.propertycache def _aliasdef(self): - return cmdalias(self.name, self.definition, self.cmdtable, self.source) + return cmdalias(self.ui, self.name, self.definition, self.cmdtable, + self.source) def __getitem__(self, n): aliasdef = self._aliasdef @@ 
-578,7 +631,7 @@ # aliases are processed after extensions have been loaded, so they # may use extension commands. Aliases can also use other alias definitions, # but only if they have been defined prior to the current definition. - for alias, definition in ui.configitems('alias'): + for alias, definition in ui.configitems('alias', ignoresub=True): try: if cmdtable[alias].definition == definition: continue @@ -587,7 +640,7 @@ pass source = ui.configsource('alias', alias) - entry = lazyaliasentry(alias, definition, cmdtable, source) + entry = lazyaliasentry(ui, alias, definition, cmdtable, source) cmdtable[alias] = entry def _parse(ui, args): @@ -597,7 +650,7 @@ try: args = fancyopts.fancyopts(args, commands.globalopts, options) except getopt.GetoptError as inst: - raise error.CommandError(None, inst) + raise error.CommandError(None, stringutil.forcebytestr(inst)) if args: cmd, args = args[0], args[1:] @@ -621,7 +674,7 @@ try: args = fancyopts.fancyopts(args, c, cmdoptions, gnu=True) except getopt.GetoptError as inst: - raise error.CommandError(cmd, inst) + raise error.CommandError(cmd, stringutil.forcebytestr(inst)) # separate global options back out for o in commands.globalopts: @@ -646,7 +699,8 @@ configs.append((section, name, value)) except (IndexError, ValueError): raise error.Abort(_('malformed --config option: %r ' - '(use --config section.name=value)') % cfg) + '(use --config section.name=value)') + % pycompat.bytestr(cfg)) return configs @@ -821,9 +875,7 @@ if options['verbose'] or options['debug'] or options['quiet']: for opt in ('verbose', 'debug', 'quiet'): - val = str(bool(options[opt])) - if pycompat.ispy3: - val = val.encode('ascii') + val = pycompat.bytestr(bool(options[opt])) for ui_ in uis: ui_.setconfig('ui', opt, val, '--' + opt) @@ -847,7 +899,7 @@ ui_.setconfig('ui', 'color', coloropt, '--color') color.setup(ui_) - if util.parsebool(options['pager']): + if stringutil.parsebool(options['pager']): # ui.pager() expects 'internal-always-' prefix in 
this case ui.pager('internal-always-' + cmd) elif options['pager'] != 'auto': @@ -876,7 +928,8 @@ else: try: repo = hg.repository(ui, path=path, - presetupfuncs=req.prereposetups) + presetupfuncs=req.prereposetups, + intents=func.intents) if not repo.local(): raise error.Abort(_("repository '%s' is not local") % path) @@ -941,9 +994,9 @@ worst = None, ct, '' if ui.config('ui', 'supportcontact') is None: for name, mod in extensions.extensions(): - testedwith = getattr(mod, 'testedwith', '') - if pycompat.ispy3 and isinstance(testedwith, str): - testedwith = testedwith.encode(u'utf-8') + # 'testedwith' should be bytes, but not all extensions are ported + # to py3 and we don't want UnicodeException because of that. + testedwith = stringutil.forcebytestr(getattr(mod, 'testedwith', '')) report = getattr(mod, 'buglink', _('the extension author.')) if not testedwith.strip(): # We found an untested extension. It's likely the culprit. @@ -965,7 +1018,8 @@ if worst[0] is not None: name, testedwith, report = worst if not isinstance(testedwith, (bytes, str)): - testedwith = '.'.join([str(c) for c in testedwith]) + testedwith = '.'.join([stringutil.forcebytestr(c) + for c in testedwith]) warning = (_('** Unknown exception encountered with ' 'possibly-broken third-party extension %s\n' '** which supports versions %s of Mercurial.\n' @@ -978,11 +1032,7 @@ bugtracker = _("https://mercurial-scm.org/wiki/BugTracker") warning = (_("** unknown exception encountered, " "please report by visiting\n** ") + bugtracker + '\n') - if pycompat.ispy3: - sysversion = sys.version.encode(u'utf-8') - else: - sysversion = sys.version - sysversion = sysversion.replace('\n', '') + sysversion = pycompat.sysbytes(sys.version).replace('\n', '') warning += ((_("** Python %s\n") % sysversion) + (_("** Mercurial Distributed SCM (version %s)\n") % util.version()) + @@ -997,6 +1047,7 @@ this function returns False, ignored otherwise. 
""" warning = _exceptionwarning(ui) - ui.log("commandexception", "%s\n%s\n", warning, traceback.format_exc()) + ui.log("commandexception", "%s\n%s\n", warning, + pycompat.sysbytes(traceback.format_exc())) ui.warn(warning) return False # re-raise the exception diff -r fb92df8b634c -r ed5448edcbfa mercurial/encoding.py --- a/mercurial/encoding.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/encoding.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,7 +7,6 @@ from __future__ import absolute_import, print_function -import io import locale import os import unicodedata @@ -181,7 +180,8 @@ return u.encode("utf-8") except UnicodeDecodeError as inst: sub = s[max(0, inst.start - 10):inst.start + 10] - raise error.Abort("decoding near '%s': %s!" % (sub, inst)) + raise error.Abort("decoding near '%s': %s!" + % (sub, pycompat.bytestr(inst))) except LookupError as k: raise error.Abort(k, hint="please check your locale settings") @@ -580,18 +580,3 @@ c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) r += c return r - -if pycompat.ispy3: - class strio(io.TextIOWrapper): - """Wrapper around TextIOWrapper that respects hg's encoding assumptions. - - Also works around Python closing streams. 
- """ - - def __init__(self, buffer): - super(strio, self).__init__(buffer, encoding=_sysstr(encoding)) - - def __del__(self): - """Override __del__ so it doesn't close the underlying stream.""" -else: - strio = pycompat.identity diff -r fb92df8b634c -r ed5448edcbfa mercurial/error.py --- a/mercurial/error.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/error.py Wed Apr 18 15:32:08 2018 -0400 @@ -47,7 +47,7 @@ # this can't be called 'message' because at least some installs of # Python 2.6+ complain about the 'message' property being deprecated self.lookupmessage = message - if isinstance(name, str) and len(name) == 20: + if isinstance(name, bytes) and len(name) == 20: from .node import short name = short(name) RevlogError.__init__(self, '%s@%s: %s' % (index, name, message)) diff -r fb92df8b634c -r ed5448edcbfa mercurial/exchange.py --- a/mercurial/exchange.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/exchange.py Wed Apr 18 15:32:08 2018 -0400 @@ -17,6 +17,9 @@ hex, nullid, ) +from .thirdparty import ( + attr, +) from . import ( bookmarks as bookmod, bundle2, @@ -35,6 +38,9 @@ url as urlmod, util, ) +from .utils import ( + stringutil, +) urlerr = util.urlerr urlreq = util.urlreq @@ -46,10 +52,47 @@ 'bundle2': '02', #legacy } +# Maps bundle version with content opts to choose which part to bundle +_bundlespeccontentopts = { + 'v1': { + 'changegroup': True, + 'cg.version': '01', + 'obsolescence': False, + 'phases': False, + 'tagsfnodescache': False, + 'revbranchcache': False + }, + 'v2': { + 'changegroup': True, + 'cg.version': '02', + 'obsolescence': False, + 'phases': False, + 'tagsfnodescache': True, + 'revbranchcache': True + }, + 'packed1' : { + 'cg.version': 's1' + } +} +_bundlespeccontentopts['bundle2'] = _bundlespeccontentopts['v2'] + +_bundlespecvariants = {"streamv2": {"changegroup": False, "streamv2": True, + "tagsfnodescache": False, + "revbranchcache": False}} + # Compression engines allowed in version 1. THIS SHOULD NEVER CHANGE. 
_bundlespecv1compengines = {'gzip', 'bzip2', 'none'} -def parsebundlespec(repo, spec, strict=True, externalnames=False): +@attr.s +class bundlespec(object): + compression = attr.ib() + wirecompression = attr.ib() + version = attr.ib() + wireversion = attr.ib() + params = attr.ib() + contentopts = attr.ib() + +def parsebundlespec(repo, spec, strict=True): """Parse a bundle string specification into parts. Bundle specifications denote a well-defined bundle/exchange format. @@ -69,11 +112,9 @@ If ``strict`` is True (the default) is required. Otherwise, it is optional. - If ``externalnames`` is False (the default), the human-centric names will - be converted to their internal representation. - - Returns a 3-tuple of (compression, version, parameters). Compression will - be ``None`` if not in strict mode and a compression isn't defined. + Returns a bundlespec object of (compression, version, parameters). + Compression will be ``None`` if not in strict mode and a compression isn't + defined. An ``InvalidBundleSpecification`` is raised when the specification is not syntactically well formed. 
@@ -165,11 +206,20 @@ _('missing support for repository features: %s') % ', '.join(sorted(missingreqs))) - if not externalnames: - engine = util.compengines.forbundlename(compression) - compression = engine.bundletype()[1] - version = _bundlespeccgversions[version] - return compression, version, params + # Compute contentopts based on the version + contentopts = _bundlespeccontentopts.get(version, {}).copy() + + # Process the variants + if "stream" in params and params["stream"] == "v2": + variant = _bundlespecvariants["streamv2"] + contentopts.update(variant) + + engine = util.compengines.forbundlename(compression) + compression, wirecompression = engine.bundletype() + wireversion = _bundlespeccgversions[version] + + return bundlespec(compression, wirecompression, version, wireversion, + params, contentopts) def readbundle(ui, fh, fname, vfs=None): header = changegroup.readexactly(fh, 4) @@ -199,14 +249,6 @@ else: raise error.Abort(_('%s: unknown bundle version %s') % (fname, version)) -def _formatrequirementsspec(requirements): - return urlreq.quote(','.join(sorted(requirements))) - -def _formatrequirementsparams(requirements): - requirements = _formatrequirementsspec(requirements) - params = "%s%s" % (urlreq.quote("requirements="), requirements) - return params - def getbundlespec(ui, fh): """Infer the bundlespec from a bundle file handle. 
@@ -247,6 +289,13 @@ 'a known bundlespec') % version, hint=_('try upgrading your Mercurial ' 'client')) + elif part.type == 'stream2' and version is None: + # A stream2 part requires to be part of a v2 bundle + version = "v2" + requirements = urlreq.unquote(part.params['requirements']) + splitted = requirements.split() + params = bundle2._formatrequirementsparams(splitted) + return 'none-v2;stream=v2;%s' % params if not version: raise error.Abort(_('could not identify changegroup version in ' @@ -255,7 +304,8 @@ return '%s-%s' % (comp, version) elif isinstance(b, streamclone.streamcloneapplier): requirements = streamclone.readbundle1header(fh)[2] - return 'none-packed1;%s' % _formatrequirementsparams(requirements) + formatted = bundle2._formatrequirementsparams(requirements) + return 'none-packed1;%s' % formatted else: raise error.Abort(_('unknown bundle type: %s') % b) @@ -283,7 +333,6 @@ This function is used to allow testing of the older bundle version""" ui = op.repo.ui - forcebundle1 = False # The goal is this config is to allow developer to choose the bundle # version used during exchanged. This is especially handy during test. # Value is a list of bundle version to be picked from, highest version @@ -541,7 +590,8 @@ (computed for both success and failure case for changesets push)""" outgoing = pushop.outgoing unfi = pushop.repo.unfiltered() - remotephases = pushop.remote.listkeys('phases') + remotephases = listkeys(pushop.remote, 'phases') + if (pushop.ui.configbool('ui', '_usedassubrepo') and remotephases # server supports phases and not pushop.outgoing.missing # no changesets to be pushed @@ -588,14 +638,20 @@ @pushdiscovery('obsmarker') def _pushdiscoveryobsmarkers(pushop): - if (obsolete.isenabled(pushop.repo, obsolete.exchangeopt) - and pushop.repo.obsstore - and 'obsolete' in pushop.remote.listkeys('namespaces')): - repo = pushop.repo - # very naive computation, that can be quite expensive on big repo. 
- # However: evolution is currently slow on them anyway. - nodes = (c.node() for c in repo.set('::%ln', pushop.futureheads)) - pushop.outobsmarkers = pushop.repo.obsstore.relevantmarkers(nodes) + if not obsolete.isenabled(pushop.repo, obsolete.exchangeopt): + return + + if not pushop.repo.obsstore: + return + + if 'obsolete' not in listkeys(pushop.remote, 'namespaces'): + return + + repo = pushop.repo + # very naive computation, that can be quite expensive on big repo. + # However: evolution is currently slow on them anyway. + nodes = (c.node() for c in repo.set('::%ln', pushop.futureheads)) + pushop.outobsmarkers = pushop.repo.obsstore.relevantmarkers(nodes) @pushdiscovery('bookmarks') def _pushdiscoverybookmarks(pushop): @@ -607,7 +663,8 @@ if pushop.revs: revnums = map(repo.changelog.rev, pushop.revs) ancestors = repo.changelog.ancestors(revnums, inclusive=True) - remotebookmark = remote.listkeys('bookmarks') + + remotebookmark = listkeys(remote, 'bookmarks') explicit = set([repo._bookmarks.expandname(bookmark) for bookmark in pushop.bookmarks]) @@ -621,16 +678,25 @@ return hex(x) def hexifycompbookmarks(bookmarks): - for b, scid, dcid in bookmarks: - yield b, safehex(scid), safehex(dcid) + return [(b, safehex(scid), safehex(dcid)) + for (b, scid, dcid) in bookmarks] comp = [hexifycompbookmarks(marks) for marks in comp] + return _processcompared(pushop, ancestors, explicit, remotebookmark, comp) + +def _processcompared(pushop, pushed, explicit, remotebms, comp): + """take decision on bookmark to pull from the remote bookmark + + Exist to help extensions who want to alter this behavior. 
+ """ addsrc, adddst, advsrc, advdst, diverge, differ, invalid, same = comp + repo = pushop.repo + for b, scid, dcid in advsrc: if b in explicit: explicit.remove(b) - if not ancestors or repo[scid].rev() in ancestors: + if not pushed or repo[scid].rev() in pushed: pushop.outbookmarks.append((b, dcid, scid)) # search added bookmark for b, scid, dcid in addsrc: @@ -656,8 +722,8 @@ if explicit: explicit = sorted(explicit) # we should probably list all of them - ui.warn(_('bookmark %s does not exist on the local ' - 'or remote repository!\n') % explicit[0]) + pushop.ui.warn(_('bookmark %s does not exist on the local ' + 'or remote repository!\n') % explicit[0]) pushop.bkresult = 2 pushop.outbookmarks.sort() @@ -1037,8 +1103,12 @@ stream = util.chunkbuffer(bundler.getchunks()) try: try: - reply = pushop.remote.unbundle( - stream, ['force'], pushop.remote.url()) + with pushop.remote.commandexecutor() as e: + reply = e.callcommand('unbundle', { + 'bundle': stream, + 'heads': ['force'], + 'url': pushop.remote.url(), + }).result() except error.BundleValueError as exc: raise error.Abort(_('missing support for %s') % exc) try: @@ -1105,7 +1175,7 @@ """synchronise phase information locally and remotely""" cheads = pushop.commonheads # even when we don't push, exchanging phase data is useful - remotephases = pushop.remote.listkeys('phases') + remotephases = listkeys(pushop.remote, 'phases') if (pushop.ui.configbool('ui', '_usedassubrepo') and remotephases # server supports phases and pushop.cgresult is None # nothing was pushed @@ -1149,10 +1219,14 @@ outdated = [c for c in outdated if c.node() not in pheads] # fallback to independent pushkey command for newremotehead in outdated: - r = pushop.remote.pushkey('phases', - newremotehead.hex(), - str(phases.draft), - str(phases.public)) + with pushop.remote.commandexecutor() as e: + r = e.callcommand('pushkey', { + 'namespace': 'phases', + 'key': newremotehead.hex(), + 'old': '%d' % phases.draft, + 'new': '%d' % phases.public + 
}).result() + if not r: pushop.ui.warn(_('updating %s to public failed!\n') % newremotehead) @@ -1207,7 +1281,16 @@ action = 'export' elif not new: action = 'delete' - if remote.pushkey('bookmarks', b, old, new): + + with remote.commandexecutor() as e: + r = e.callcommand('pushkey', { + 'namespace': 'bookmarks', + 'key': b, + 'old': old, + 'new': new, + }).result() + + if r: ui.status(bookmsgmap[action][0] % b) else: ui.warn(bookmsgmap[action][1] % b) @@ -1316,6 +1399,48 @@ if self._tr is not None: self._tr.release() +def listkeys(remote, namespace): + with remote.commandexecutor() as e: + return e.callcommand('listkeys', {'namespace': namespace}).result() + +def _fullpullbundle2(repo, pullop): + # The server may send a partial reply, i.e. when inlining + # pre-computed bundles. In that case, update the common + # set based on the results and pull another bundle. + # + # There are two indicators that the process is finished: + # - no changeset has been added, or + # - all remote heads are known locally. + # The head check must use the unfiltered view as obsoletion + # markers can hide heads. + unfi = repo.unfiltered() + unficl = unfi.changelog + def headsofdiff(h1, h2): + """Returns heads(h1 % h2)""" + res = unfi.set('heads(%ln %% %ln)', h1, h2) + return set(ctx.node() for ctx in res) + def headsofunion(h1, h2): + """Returns heads((h1 + h2) - null)""" + res = unfi.set('heads((%ln + %ln - null))', h1, h2) + return set(ctx.node() for ctx in res) + while True: + old_heads = unficl.heads() + clstart = len(unficl) + _pullbundle2(pullop) + if changegroup.NARROW_REQUIREMENT in repo.requirements: + # XXX narrow clones filter the heads on the server side during + # XXX getbundle and result in partial replies as well. + # XXX Disable pull bundles in this case as band aid to avoid + # XXX extra round trips. 
+ break + if clstart == len(unficl): + break + if all(unficl.hasnode(n) for n in pullop.rheads): + break + new_heads = headsofdiff(unficl.heads(), old_heads) + pullop.common = headsofunion(new_heads, pullop.common) + pullop.rheads = set(pullop.rheads) - pullop.common + def pull(repo, remote, heads=None, force=False, bookmarks=(), opargs=None, streamclonerequested=None): """Fetch repository data from a remote. @@ -1361,7 +1486,7 @@ streamclone.maybeperformlegacystreamclone(pullop) _pulldiscovery(pullop) if pullop.canusebundle2: - _pullbundle2(pullop) + _fullpullbundle2(repo, pullop) _pullchangeset(pullop) _pullphase(pullop) _pullbookmarks(pullop) @@ -1415,7 +1540,7 @@ # all known bundle2 servers now support listkeys, but lets be nice with # new implementation. return - books = pullop.remote.listkeys('bookmarks') + books = listkeys(pullop.remote, 'bookmarks') pullop.remotebookmarks = bookmod.unhexlifybookmarks(books) @@ -1534,16 +1659,22 @@ kwargs['obsmarkers'] = True pullop.stepsdone.add('obsmarkers') _pullbundle2extraprepare(pullop, kwargs) - bundle = pullop.remote.getbundle('pull', **pycompat.strkwargs(kwargs)) - try: - op = bundle2.bundleoperation(pullop.repo, pullop.gettransaction) - op.modes['bookmarks'] = 'records' - bundle2.processbundle(pullop.repo, bundle, op=op) - except bundle2.AbortFromPart as exc: - pullop.repo.ui.status(_('remote: abort: %s\n') % exc) - raise error.Abort(_('pull failed on remote'), hint=exc.hint) - except error.BundleValueError as exc: - raise error.Abort(_('missing support for %s') % exc) + + with pullop.remote.commandexecutor() as e: + args = dict(kwargs) + args['source'] = 'pull' + bundle = e.callcommand('getbundle', args).result() + + try: + op = bundle2.bundleoperation(pullop.repo, pullop.gettransaction, + source='pull') + op.modes['bookmarks'] = 'records' + bundle2.processbundle(pullop.repo, bundle, op=op) + except bundle2.AbortFromPart as exc: + pullop.repo.ui.status(_('remote: abort: %s\n') % exc) + raise error.Abort(_('pull 
failed on remote'), hint=exc.hint) + except error.BundleValueError as exc: + raise error.Abort(_('missing support for %s') % exc) if pullop.fetch: pullop.cgresult = bundle2.combinechangegroupresults(op) @@ -1595,13 +1726,24 @@ cg = pullop.remote.getbundle('pull', common=pullop.common, heads=pullop.heads or pullop.rheads) elif pullop.heads is None: - cg = pullop.remote.changegroup(pullop.fetch, 'pull') + with pullop.remote.commandexecutor() as e: + cg = e.callcommand('changegroup', { + 'nodes': pullop.fetch, + 'source': 'pull', + }).result() + elif not pullop.remote.capable('changegroupsubset'): raise error.Abort(_("partial pull cannot be done because " "other repository doesn't support " "changegroupsubset.")) else: - cg = pullop.remote.changegroupsubset(pullop.fetch, pullop.heads, 'pull') + with pullop.remote.commandexecutor() as e: + cg = e.callcommand('changegroupsubset', { + 'bases': pullop.fetch, + 'heads': pullop.heads, + 'source': 'pull', + }).result() + bundleop = bundle2.applybundle(pullop.repo, cg, tr, 'pull', pullop.remote.url()) pullop.cgresult = bundle2.combinechangegroupresults(bundleop) @@ -1610,7 +1752,7 @@ # Get remote phases data from remote if 'phases' in pullop.stepsdone: return - remotephases = pullop.remote.listkeys('phases') + remotephases = listkeys(pullop.remote, 'phases') _pullapplyphases(pullop, remotephases) def _pullapplyphases(pullop, remotephases): @@ -1674,7 +1816,7 @@ tr = None if obsolete.isenabled(pullop.repo, obsolete.exchangeopt): pullop.repo.ui.debug('fetching remote obsolete markers\n') - remoteobs = pullop.remote.listkeys('obsolete') + remoteobs = listkeys(pullop.remote, 'obsolete') if 'dump0' in remoteobs: tr = pullop.gettransaction() markers = [] @@ -1775,29 +1917,8 @@ return info, bundler.getchunks() @getbundle2partsgenerator('stream2') -def _getbundlestream2(bundler, repo, source, bundlecaps=None, - b2caps=None, heads=None, common=None, **kwargs): - if not kwargs.get('stream', False): - return - - if not 
streamclone.allowservergeneration(repo): - raise error.Abort(_('stream data requested but server does not allow ' - 'this feature'), - hint=_('well-behaved clients should not be ' - 'requesting stream data from servers not ' - 'advertising it; the client may be buggy')) - - # Stream clones don't compress well. And compression undermines a - # goal of stream clones, which is to be fast. Communicate the desire - # to avoid compression to consumers of the bundle. - bundler.prefercompressed = False - - filecount, bytecount, it = streamclone.generatev2(repo) - requirements = _formatrequirementsspec(repo.requirements) - part = bundler.newpart('stream2', data=it) - part.addparam('bytecount', '%d' % bytecount, mandatory=True) - part.addparam('filecount', '%d' % filecount, mandatory=True) - part.addparam('requirements', requirements, mandatory=True) +def _getbundlestream2(bundler, repo, *args, **kwargs): + return bundle2.addpartbundlestream2(bundler, repo, **kwargs) @getbundle2partsgenerator('changegroup') def _getbundlechangegrouppart(bundler, repo, source, bundlecaps=None, @@ -1931,6 +2052,28 @@ outgoing = _computeoutgoing(repo, heads, common) bundle2.addparttagsfnodescache(repo, bundler, outgoing) +@getbundle2partsgenerator('cache:rev-branch-cache') +def _getbundlerevbranchcache(bundler, repo, source, bundlecaps=None, + b2caps=None, heads=None, common=None, + **kwargs): + """Transfer the rev-branch-cache mapping + + The payload is a series of data related to each branch + + 1) branch name length + 2) number of open heads + 3) number of closed heads + 4) open heads nodes + 5) closed heads nodes + """ + # Don't send unless: + # - changeset are being exchanged, + # - the client supports it. 
+ if not (kwargs.get(r'cg', True)) or 'rev-branch-cache' not in b2caps: + return + outgoing = _computeoutgoing(repo, heads, common) + bundle2.addpartrevbranchcache(repo, bundler, outgoing) + def check_heads(repo, their_heads, context): """check if the heads of a repo have been modified @@ -1992,7 +2135,8 @@ gettransaction() op = bundle2.bundleoperation(repo, gettransaction, - captureoutput=captureoutput) + captureoutput=captureoutput, + source='push') try: op = bundle2.processbundle(repo, cg, op=op) finally: @@ -2037,7 +2181,8 @@ if not remote.capable('clonebundles'): return - res = remote._call('clonebundles') + with remote.commandexecutor() as e: + res = e.callcommand('clonebundles', {}).result() # If we call the wire protocol command, that's good enough to record the # attempt. @@ -2107,10 +2252,9 @@ # component of the BUNDLESPEC. if key == 'BUNDLESPEC': try: - comp, version, params = parsebundlespec(repo, value, - externalnames=True) - attrs['COMPRESSION'] = comp - attrs['VERSION'] = version + bundlespec = parsebundlespec(repo, value) + attrs['COMPRESSION'] = bundlespec.compression + attrs['VERSION'] = bundlespec.version except error.InvalidBundleSpecification: pass except error.UnsupportedBundleSpecification: @@ -2120,6 +2264,19 @@ return m +def isstreamclonespec(bundlespec): + # Stream clone v1 + if (bundlespec.wirecompression == 'UN' and bundlespec.wireversion == 's1'): + return True + + # Stream clone v2 + if (bundlespec.wirecompression == 'UN' and \ + bundlespec.wireversion == '02' and \ + bundlespec.contentopts.get('streamv2')): + return True + + return False + def filterclonebundleentries(repo, entries, streamclonerequested=False): """Remove incompatible clone bundle manifest entries. @@ -2135,21 +2292,22 @@ spec = entry.get('BUNDLESPEC') if spec: try: - comp, version, params = parsebundlespec(repo, spec, strict=True) + bundlespec = parsebundlespec(repo, spec, strict=True) # If a stream clone was requested, filter out non-streamclone # entries. 
- if streamclonerequested and (comp != 'UN' or version != 's1'): + if streamclonerequested and not isstreamclonespec(bundlespec): repo.ui.debug('filtering %s because not a stream clone\n' % entry['URL']) continue except error.InvalidBundleSpecification as e: - repo.ui.debug(str(e) + '\n') + repo.ui.debug(stringutil.forcebytestr(e) + '\n') continue except error.UnsupportedBundleSpecification as e: repo.ui.debug('filtering %s because unsupported bundle ' - 'spec: %s\n' % (entry['URL'], str(e))) + 'spec: %s\n' % ( + entry['URL'], stringutil.forcebytestr(e))) continue # If we don't have a spec and requested a stream clone, we don't know # what the entry is so don't attempt to apply it. @@ -2254,8 +2412,10 @@ bundle2.applybundle(repo, cg, tr, 'clonebundles', url) return True except urlerr.httperror as e: - ui.warn(_('HTTP error fetching bundle: %s\n') % str(e)) + ui.warn(_('HTTP error fetching bundle: %s\n') % + stringutil.forcebytestr(e)) except urlerr.urlerror as e: - ui.warn(_('error fetching bundle: %s\n') % e.reason) + ui.warn(_('error fetching bundle: %s\n') % + stringutil.forcebytestr(e.reason)) return False diff -r fb92df8b634c -r ed5448edcbfa mercurial/extensions.py --- a/mercurial/extensions.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/extensions.py Wed Apr 18 15:32:08 2018 -0400 @@ -25,6 +25,10 @@ util, ) +from .utils import ( + stringutil, +) + _extensions = {} _disabledextensions = {} _aftercallbacks = {} @@ -118,10 +122,22 @@ # note: this ui.debug happens before --debug is processed, # Use --config ui.debug=1 to see them. 
ui.debug('could not import %s (%s): trying %s\n' - % (failed, util.forcebytestr(err), next)) + % (failed, stringutil.forcebytestr(err), next)) if ui.debugflag: ui.traceback() +def _rejectunicode(name, xs): + if isinstance(xs, (list, set, tuple)): + for x in xs: + _rejectunicode(name, x) + elif isinstance(xs, dict): + for k, v in xs.items(): + _rejectunicode(name, k) + _rejectunicode(b'%s.%s' % (name, stringutil.forcebytestr(k)), v) + elif isinstance(xs, type(u'')): + raise error.ProgrammingError(b"unicode %r found in %s" % (xs, name), + hint="use b'' to make it byte string") + # attributes set by registrar.command _cmdfuncattrs = ('norepo', 'optionalrepo', 'inferrepo') @@ -134,19 +150,22 @@ "registrar.command to register '%s'" % c, '4.6') missing = [a for a in _cmdfuncattrs if not util.safehasattr(f, a)] if not missing: - for option in e[1]: - default = option[2] - if isinstance(default, type(u'')): - raise error.ProgrammingError( - "option '%s.%s' has a unicode default value" - % (c, option[1]), - hint=("change the %s.%s default value to a " - "non-unicode string" % (c, option[1]))) continue raise error.ProgrammingError( 'missing attributes: %s' % ', '.join(missing), hint="use @command decorator to register '%s'" % c) +def _validatetables(ui, mod): + """Sanity check for loadable tables provided by extension module""" + for t in ['cmdtable', 'colortable', 'configtable']: + _rejectunicode(t, getattr(mod, t, {})) + for t in ['filesetpredicate', 'internalmerge', 'revsetpredicate', + 'templatefilter', 'templatefunc', 'templatekeyword']: + o = getattr(mod, t, None) + if o: + _rejectunicode(t, o._table) + _validatecmdtable(ui, getattr(mod, 'cmdtable', {})) + def load(ui, name, path): if name.startswith('hgext.') or name.startswith('hgext/'): shortname = name[6:] @@ -168,7 +187,7 @@ ui.warn(_('(third party extension %s requires version %s or newer ' 'of Mercurial; disabling)\n') % (shortname, minver)) return - _validatecmdtable(ui, getattr(mod, 'cmdtable', {})) + 
_validatetables(ui, mod) _extensions[shortname] = mod _order.append(shortname) @@ -183,7 +202,7 @@ uisetup(ui) except Exception as inst: ui.traceback(force=True) - msg = util.forcebytestr(inst) + msg = stringutil.forcebytestr(inst) ui.warn(_("*** failed to set up extension %s: %s\n") % (name, msg)) return False return True @@ -195,16 +214,12 @@ try: extsetup(ui) except TypeError: - # Try to use getfullargspec (Python 3) first, and fall - # back to getargspec only if it doesn't exist so as to - # avoid warnings. - if getattr(inspect, 'getfullargspec', - getattr(inspect, 'getargspec'))(extsetup).args: + if pycompat.getargspec(extsetup).args: raise extsetup() # old extsetup with no ui argument except Exception as inst: ui.traceback(force=True) - msg = util.forcebytestr(inst) + msg = stringutil.forcebytestr(inst) ui.warn(_("*** failed to set up extension %s: %s\n") % (name, msg)) return False return True @@ -222,7 +237,7 @@ try: load(ui, name, path) except Exception as inst: - msg = util.forcebytestr(inst) + msg = stringutil.forcebytestr(inst) if path: ui.warn(_("*** failed to import extension %s from %s: %s\n") % (name, path, msg)) @@ -279,8 +294,8 @@ fileset, revset, templatefilters, + templatefuncs, templatekw, - templater, ) # list of (objname, loadermod, loadername) tuple: @@ -296,7 +311,7 @@ ('internalmerge', filemerge, 'loadinternalmerge'), ('revsetpredicate', revset, 'loadpredicate'), ('templatefilter', templatefilters, 'loadfilter'), - ('templatefunc', templater, 'loadfunction'), + ('templatefunc', templatefuncs, 'loadfunction'), ('templatekeyword', templatekw, 'loadkeyword'), ] _loadextra(ui, newindex, extraloaders) diff -r fb92df8b634c -r ed5448edcbfa mercurial/fancyopts.py --- a/mercurial/fancyopts.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/fancyopts.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,6 +7,7 @@ from __future__ import absolute_import +import abc import functools from .i18n import _ @@ -201,6 +202,74 @@ parsedargs.extend(args[pos:]) return 
parsedopts, parsedargs +class customopt(object): + """Manage defaults and mutations for any type of opt.""" + + __metaclass__ = abc.ABCMeta + + def __init__(self, defaultvalue): + self._defaultvalue = defaultvalue + + def _isboolopt(self): + return False + + def getdefaultvalue(self): + """Returns the default value for this opt. + + Subclasses should override this to return a new value if the value type + is mutable.""" + return self._defaultvalue + + @abc.abstractmethod + def newstate(self, oldstate, newparam, abort): + """Adds newparam to oldstate and returns the new state. + + On failure, abort can be called with a string error message.""" + +class _simpleopt(customopt): + def _isboolopt(self): + return isinstance(self._defaultvalue, (bool, type(None))) + + def newstate(self, oldstate, newparam, abort): + return newparam + +class _callableopt(customopt): + def __init__(self, callablefn): + self.callablefn = callablefn + super(_callableopt, self).__init__(None) + + def newstate(self, oldstate, newparam, abort): + return self.callablefn(newparam) + +class _listopt(customopt): + def getdefaultvalue(self): + return self._defaultvalue[:] + + def newstate(self, oldstate, newparam, abort): + oldstate.append(newparam) + return oldstate + +class _intopt(customopt): + def newstate(self, oldstate, newparam, abort): + try: + return int(newparam) + except ValueError: + abort(_('expected int')) + +def _defaultopt(default): + """Returns a default opt implementation, given a default value.""" + + if isinstance(default, customopt): + return default + elif callable(default): + return _callableopt(default) + elif isinstance(default, list): + return _listopt(default[:]) + elif type(default) is type(1): + return _intopt(default) + else: + return _simpleopt(default) + def fancyopts(args, options, state, gnu=False, early=False, optaliases=None): """ read args, parse options, and store options in state @@ -220,6 +289,7 @@ list - parameter string is added to a list integer - parameter 
strings is stored as int function - call function with parameter + customopt - subclass of 'customopt' optaliases is a mapping from a canonical option name to a list of additional long options. This exists for preserving backward compatibility @@ -250,18 +320,13 @@ argmap['-' + short] = name for n in onames: argmap['--' + n] = name - defmap[name] = default + defmap[name] = _defaultopt(default) # copy defaults to state - if isinstance(default, list): - state[name] = default[:] - elif callable(default): - state[name] = None - else: - state[name] = default + state[name] = defmap[name].getdefaultvalue() # does it take a parameter? - if not (default is None or default is True or default is False): + if not defmap[name]._isboolopt(): if short: short += ':' onames = [n + '=' for n in onames] @@ -301,21 +366,13 @@ boolval = False name = argmap[opt] obj = defmap[name] - t = type(obj) - if callable(obj): - state[name] = defmap[name](val) - elif t is type(1): - try: - state[name] = int(val) - except ValueError: - raise error.Abort(_('invalid value %r for option %s, ' - 'expected int') % (val, opt)) - elif t is type(''): - state[name] = val - elif t is type([]): - state[name].append(val) - elif t is type(None) or t is type(False): + if obj._isboolopt(): state[name] = boolval + else: + def abort(s): + raise error.Abort(_('invalid value %r for option %s, %s') + % (pycompat.maybebytestr(val), opt, s)) + state[name] = defmap[name].newstate(state[name], val, abort) # return unparsed args return args diff -r fb92df8b634c -r ed5448edcbfa mercurial/filelog.py --- a/mercurial/filelog.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/filelog.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,44 +7,116 @@ from __future__ import absolute_import -import re -import struct - +from .thirdparty.zope import ( + interface as zi, +) from . 
import ( error, - mdiff, + repository, revlog, ) -_mdre = re.compile('\1\n') -def parsemeta(text): - """return (metadatadict, metadatasize)""" - # text can be buffer, so we can't use .startswith or .index - if text[:2] != '\1\n': - return None, None - s = _mdre.search(text, 2).start() - mtext = text[2:s] - meta = {} - for l in mtext.splitlines(): - k, v = l.split(": ", 1) - meta[k] = v - return meta, (s + 2) - -def packmeta(meta, text): - keys = sorted(meta) - metatext = "".join("%s: %s\n" % (k, meta[k]) for k in keys) - return "\1\n%s\1\n%s" % (metatext, text) - -def _censoredtext(text): - m, offs = parsemeta(text) - return m and "censored" in m - -class filelog(revlog.revlog): +@zi.implementer(repository.ifilestorage) +class filelog(object): def __init__(self, opener, path): - super(filelog, self).__init__(opener, - "/".join(("data", path + ".i"))) + self._revlog = revlog.revlog(opener, + '/'.join(('data', path + '.i')), + censorable=True) # full name of the user visible file, relative to the repository root self.filename = path + self.index = self._revlog.index + self.version = self._revlog.version + self.storedeltachains = self._revlog.storedeltachains + self._generaldelta = self._revlog._generaldelta + + def __len__(self): + return len(self._revlog) + + def __iter__(self): + return self._revlog.__iter__() + + def revs(self, start=0, stop=None): + return self._revlog.revs(start=start, stop=stop) + + def parents(self, node): + return self._revlog.parents(node) + + def parentrevs(self, rev): + return self._revlog.parentrevs(rev) + + def rev(self, node): + return self._revlog.rev(node) + + def node(self, rev): + return self._revlog.node(rev) + + def lookup(self, node): + return self._revlog.lookup(node) + + def linkrev(self, rev): + return self._revlog.linkrev(rev) + + def flags(self, rev): + return self._revlog.flags(rev) + + def commonancestorsheads(self, node1, node2): + return self._revlog.commonancestorsheads(node1, node2) + + def descendants(self, revs): + 
return self._revlog.descendants(revs) + + def headrevs(self): + return self._revlog.headrevs() + + def heads(self, start=None, stop=None): + return self._revlog.heads(start, stop) + + def children(self, node): + return self._revlog.children(node) + + def deltaparent(self, rev): + return self._revlog.deltaparent(rev) + + def candelta(self, baserev, rev): + return self._revlog.candelta(baserev, rev) + + def iscensored(self, rev): + return self._revlog.iscensored(rev) + + def rawsize(self, rev): + return self._revlog.rawsize(rev) + + def checkhash(self, text, node, p1=None, p2=None, rev=None): + return self._revlog.checkhash(text, node, p1=p1, p2=p2, rev=rev) + + def revision(self, node, _df=None, raw=False): + return self._revlog.revision(node, _df=_df, raw=raw) + + def revdiff(self, rev1, rev2): + return self._revlog.revdiff(rev1, rev2) + + def addrevision(self, revisiondata, transaction, linkrev, p1, p2, + node=None, flags=revlog.REVIDX_DEFAULT_FLAGS, + cachedelta=None): + return self._revlog.addrevision(revisiondata, transaction, linkrev, + p1, p2, node=node, flags=flags, + cachedelta=cachedelta) + + def addgroup(self, deltas, linkmapper, transaction, addrevisioncb=None): + return self._revlog.addgroup(deltas, linkmapper, transaction, + addrevisioncb=addrevisioncb) + + def getstrippoint(self, minlink): + return self._revlog.getstrippoint(minlink) + + def strip(self, minlink, transaction): + return self._revlog.strip(minlink, transaction) + + def files(self): + return self._revlog.files() + + def checksize(self): + return self._revlog.checksize() def read(self, node): t = self.revision(node) @@ -55,14 +127,14 @@ def add(self, text, meta, transaction, link, p1=None, p2=None): if meta or text.startswith('\1\n'): - text = packmeta(meta, text) + text = revlog.packmeta(meta, text) return self.addrevision(text, transaction, link, p1, p2) def renamed(self, node): if self.parents(node)[0] != revlog.nullid: return False t = self.revision(node) - m = parsemeta(t)[0] + m = 
revlog.parsemeta(t)[0] if m and "copy" in m: return (m["copy"], revlog.bin(m["copyrev"])) return False @@ -78,7 +150,7 @@ return 0 # XXX if self.read(node).startswith("\1\n"), this returns (size+4) - return super(filelog, self).size(rev) + return self._revlog.size(rev) def cmp(self, node, text): """compare text with a given file revision @@ -90,7 +162,7 @@ if text.startswith('\1\n'): t = '\1\n\1\n' + text - samehashes = not super(filelog, self).cmp(node, t) + samehashes = not self._revlog.cmp(node, t) if samehashes: return False @@ -106,34 +178,90 @@ return True - def checkhash(self, text, node, p1=None, p2=None, rev=None): - try: - super(filelog, self).checkhash(text, node, p1=p1, p2=p2, rev=rev) - except error.RevlogError: - if _censoredtext(text): - raise error.CensoredNodeError(self.indexfile, node, text) - raise + @property + def filename(self): + return self._revlog.filename + + @filename.setter + def filename(self, value): + self._revlog.filename = value + + # TODO these aren't part of the interface and aren't internal methods. + # Callers should be fixed to not use them. 
+ @property + def indexfile(self): + return self._revlog.indexfile + + @indexfile.setter + def indexfile(self, value): + self._revlog.indexfile = value - def iscensored(self, rev): - """Check if a file revision is censored.""" - return self.flags(rev) & revlog.REVIDX_ISCENSORED + @property + def datafile(self): + return self._revlog.datafile + + @property + def opener(self): + return self._revlog.opener + + @property + def _lazydeltabase(self): + return self._revlog._lazydeltabase + + @_lazydeltabase.setter + def _lazydeltabase(self, value): + self._revlog._lazydeltabase = value + + @property + def _aggressivemergedeltas(self): + return self._revlog._aggressivemergedeltas + + @_aggressivemergedeltas.setter + def _aggressivemergedeltas(self, value): + self._revlog._aggressivemergedeltas = value - def _peek_iscensored(self, baserev, delta, flush): - """Quickly check if a delta produces a censored revision.""" - # Fragile heuristic: unless new file meta keys are added alphabetically - # preceding "censored", all censored revisions are prefixed by - # "\1\ncensored:". A delta producing such a censored revision must be a - # full-replacement delta, so we inspect the first and only patch in the - # delta for this prefix. 
- hlen = struct.calcsize(">lll") - if len(delta) <= hlen: - return False + @property + def _inline(self): + return self._revlog._inline + + @property + def _withsparseread(self): + return getattr(self._revlog, '_withsparseread', False) + + @property + def _srmingapsize(self): + return self._revlog._srmingapsize + + @property + def _srdensitythreshold(self): + return self._revlog._srdensitythreshold + + def _deltachain(self, rev, stoprev=None): + return self._revlog._deltachain(rev, stoprev) + + def chainbase(self, rev): + return self._revlog.chainbase(rev) - oldlen = self.rawsize(baserev) - newlen = len(delta) - hlen - if delta[:hlen] != mdiff.replacediffheader(oldlen, newlen): - return False + def chainlen(self, rev): + return self._revlog.chainlen(rev) + + def clone(self, tr, destrevlog, **kwargs): + if not isinstance(destrevlog, filelog): + raise error.ProgrammingError('expected filelog to clone()') + + return self._revlog.clone(tr, destrevlog._revlog, **kwargs) + + def start(self, rev): + return self._revlog.start(rev) - add = "\1\ncensored:" - addlen = len(add) - return newlen >= addlen and delta[hlen:hlen + addlen] == add + def end(self, rev): + return self._revlog.end(rev) + + def length(self, rev): + return self._revlog.length(rev) + + def compress(self, data): + return self._revlog.compress(data) + + def _addrevision(self, *args, **kwargs): + return self._revlog._addrevision(*args, **kwargs) diff -r fb92df8b634c -r ed5448edcbfa mercurial/filemerge.py --- a/mercurial/filemerge.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/filemerge.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,8 +7,10 @@ from __future__ import absolute_import +import contextlib import os import re +import shutil import tempfile from .i18n import _ @@ -29,6 +31,11 @@ util, ) +from .utils import ( + procutil, + stringutil, +) + def _toolstr(ui, tool, part, *args): return ui.config("merge-tools", tool + "." 
+ part, *args) @@ -116,11 +123,11 @@ continue p = util.lookupreg(k, _toolstr(ui, tool, "regname")) if p: - p = util.findexe(p + _toolstr(ui, tool, "regappend", "")) + p = procutil.findexe(p + _toolstr(ui, tool, "regappend", "")) if p: return p exe = _toolstr(ui, tool, "executable", tool) - return util.findexe(util.expandpath(exe)) + return procutil.findexe(util.expandpath(exe)) def _picktool(repo, ui, path, binary, symlink, changedelete): def supportscd(tool): @@ -143,7 +150,7 @@ # the nomerge tools are the only tools that support change/delete # conflicts pass - elif not util.gui() and _toolbool(ui, tool, "gui"): + elif not procutil.gui() and _toolbool(ui, tool, "gui"): ui.warn(_("tool %s requires a GUI\n") % tmsg) else: return True @@ -158,7 +165,7 @@ return ":prompt", None else: if toolpath: - return (force, util.shellquote(toolpath)) + return (force, procutil.shellquote(toolpath)) else: # mimic HGMERGE if given tool not found return (force, force) @@ -176,7 +183,7 @@ mf = match.match(repo.root, '', [pat]) if mf(path) and check(tool, pat, symlink, False, changedelete): toolpath = _findtool(ui, tool) - return (tool, util.shellquote(toolpath)) + return (tool, procutil.shellquote(toolpath)) # then merge tools tools = {} @@ -201,7 +208,7 @@ for p, t in tools: if check(t, None, symlink, binary, changedelete): toolpath = _findtool(ui, t) - return (t, util.shellquote(toolpath)) + return (t, procutil.shellquote(toolpath)) # internal merge or prompt as last resort if symlink or binary or changedelete: @@ -509,28 +516,41 @@ 'for %s\n') % (tool, fcd.path())) return False, 1, None unused, unused, unused, back = files - a = _workingpath(repo, fcd) - b, c = _maketempfiles(repo, fco, fca) - try: - out = "" + localpath = _workingpath(repo, fcd) + args = _toolstr(repo.ui, tool, "args") + + with _maketempfiles(repo, fco, fca, repo.wvfs.join(back.path()), + "$output" in args) as temppaths: + basepath, otherpath, localoutputpath = temppaths + outpath = "" + mylabel, otherlabel = 
labels[:2] + if len(labels) >= 3: + baselabel = labels[2] + else: + baselabel = 'base' env = {'HG_FILE': fcd.path(), 'HG_MY_NODE': short(mynode), - 'HG_OTHER_NODE': str(fco.changectx()), - 'HG_BASE_NODE': str(fca.changectx()), + 'HG_OTHER_NODE': short(fco.changectx().node()), + 'HG_BASE_NODE': short(fca.changectx().node()), 'HG_MY_ISLINK': 'l' in fcd.flags(), 'HG_OTHER_ISLINK': 'l' in fco.flags(), 'HG_BASE_ISLINK': 'l' in fca.flags(), + 'HG_MY_LABEL': mylabel, + 'HG_OTHER_LABEL': otherlabel, + 'HG_BASE_LABEL': baselabel, } ui = repo.ui - args = _toolstr(ui, tool, "args") if "$output" in args: # read input from backup, write to original - out = a - a = repo.wvfs.join(back.path()) - replace = {'local': a, 'base': b, 'other': c, 'output': out} - args = util.interpolate(r'\$', replace, args, - lambda s: util.shellquote(util.localpath(s))) + outpath = localpath + localpath = localoutputpath + replace = {'local': localpath, 'base': basepath, 'other': otherpath, + 'output': outpath, 'labellocal': mylabel, + 'labelother': otherlabel, 'labelbase': baselabel} + args = util.interpolate( + br'\$', replace, args, + lambda s: procutil.shellquote(util.localpath(s))) cmd = toolpath + ' ' + args if _toolbool(ui, tool, "gui"): repo.ui.status(_('running merge tool %s for file %s\n') % @@ -539,9 +559,6 @@ r = ui.system(cmd, cwd=repo.root, environ=env, blockedtag='mergetool') repo.ui.debug('merge tool returned: %d\n' % r) return True, r, False - finally: - util.unlink(b) - util.unlink(c) def _formatconflictmarker(ctx, template, label, pad): """Applies the given template to the ctx, prefixed by the label. @@ -553,7 +570,7 @@ ctx = ctx.p1() props = {'ctx': ctx} - templateresult = template.render(props) + templateresult = template.renderdefault(props) label = ('%s:' % label).ljust(pad + 1) mark = '%s %s' % (label, templateresult) @@ -562,11 +579,11 @@ mark = mark.splitlines()[0] # split for safety # 8 for the prefix of conflict marker lines (e.g. 
'<<<<<<< ') - return util.ellipsis(mark, 80 - 8) + return stringutil.ellipsis(mark, 80 - 8) _defaultconflictlabels = ['local', 'other'] -def _formatlabels(repo, fcd, fco, fca, labels): +def _formatlabels(repo, fcd, fco, fca, labels, tool=None): """Formats the given labels using the conflict marker template. Returns a list of formatted labels. @@ -577,6 +594,8 @@ ui = repo.ui template = ui.config('ui', 'mergemarkertemplate') + if tool is not None: + template = _toolstr(ui, tool, 'mergemarkertemplate', template) template = templater.unquotestring(template) tres = formatter.templateresources(ui, repo) tmpl = formatter.maketemplater(ui, template, defaults=templatekw.keywords, @@ -653,24 +672,62 @@ # the backup context regardless of where it lives. return context.arbitraryfilectx(back, repo=repo) -def _maketempfiles(repo, fco, fca): - """Writes out `fco` and `fca` as temporary files, so an external merge - tool may use them. +@contextlib.contextmanager +def _maketempfiles(repo, fco, fca, localpath, uselocalpath): + """Writes out `fco` and `fca` as temporary files, and (if uselocalpath) + copies `localpath` to another temporary file, so an external merge tool may + use them. """ - def temp(prefix, ctx): - fullbase, ext = os.path.splitext(ctx.path()) - pre = "%s~%s." 
% (os.path.basename(fullbase), prefix) - (fd, name) = tempfile.mkstemp(prefix=pre, suffix=ext) + tmproot = None + tmprootprefix = repo.ui.config('experimental', 'mergetempdirprefix') + if tmprootprefix: + tmproot = tempfile.mkdtemp(prefix=tmprootprefix) + + def maketempfrompath(prefix, path): + fullbase, ext = os.path.splitext(path) + pre = "%s~%s" % (os.path.basename(fullbase), prefix) + if tmproot: + name = os.path.join(tmproot, pre) + if ext: + name += ext + f = open(name, r"wb") + else: + fd, name = tempfile.mkstemp(prefix=pre + '.', suffix=ext) + f = os.fdopen(fd, r"wb") + return f, name + + def tempfromcontext(prefix, ctx): + f, name = maketempfrompath(prefix, ctx.path()) data = repo.wwritedata(ctx.path(), ctx.data()) - f = os.fdopen(fd, pycompat.sysstr("wb")) f.write(data) f.close() return name - b = temp("base", fca) - c = temp("other", fco) + b = tempfromcontext("base", fca) + c = tempfromcontext("other", fco) + d = localpath + if uselocalpath: + # We start off with this being the backup filename, so remove the .orig + # to make syntax-highlighting more likely. + if d.endswith('.orig'): + d, _ = os.path.splitext(d) + f, d = maketempfrompath("local", d) + with open(localpath, 'rb') as src: + f.write(src.read()) + f.close() - return b, c + try: + yield b, c, d + finally: + if tmproot: + shutil.rmtree(tmproot) + else: + util.unlink(b) + util.unlink(c) + # if not uselocalpath, d is the 'orig'/backup file which we + # shouldn't delete. 
+ if d and uselocalpath: + util.unlink(d) def _filemerge(premerge, repo, wctx, mynode, orig, fcd, fco, fca, labels=None): """perform a 3-way merge in the working directory @@ -706,6 +763,7 @@ mergetype = func.mergetype onfailure = func.onfailure precheck = func.precheck + isexternal = False else: if wctx.isinmemory(): func = _xmergeimm @@ -714,6 +772,7 @@ mergetype = fullmerge onfailure = _("merging %s failed!\n") precheck = None + isexternal = True toolconf = tool, toolpath, binary, symlink @@ -743,19 +802,42 @@ files = (None, None, None, back) r = 1 try: - markerstyle = ui.config('ui', 'mergemarkers') + internalmarkerstyle = ui.config('ui', 'mergemarkers') + if isexternal: + markerstyle = _toolstr(ui, tool, 'mergemarkers') + else: + markerstyle = internalmarkerstyle + if not labels: labels = _defaultconflictlabels + formattedlabels = labels if markerstyle != 'basic': - labels = _formatlabels(repo, fcd, fco, fca, labels) + formattedlabels = _formatlabels(repo, fcd, fco, fca, labels, + tool=tool) if premerge and mergetype == fullmerge: - r = _premerge(repo, fcd, fco, fca, toolconf, files, labels=labels) + # conflict markers generated by premerge will use 'detailed' + # settings if either ui.mergemarkers or the tool's mergemarkers + # setting is 'detailed'. This way tools can have basic labels in + # space-constrained areas of the UI, but still get full information + # in conflict markers if premerge is 'keep' or 'keep-merge3'. 
+ premergelabels = labels + labeltool = None + if markerstyle != 'basic': + # respect 'tool's mergemarkertemplate (which defaults to + # ui.mergemarkertemplate) + labeltool = tool + if internalmarkerstyle != 'basic' or markerstyle != 'basic': + premergelabels = _formatlabels(repo, fcd, fco, fca, + premergelabels, tool=labeltool) + + r = _premerge(repo, fcd, fco, fca, toolconf, files, + labels=premergelabels) # complete if premerge successful (r is 0) return not r, r, False needcheck, r, deleted = func(repo, mynode, orig, fcd, fco, fca, - toolconf, files, labels=labels) + toolconf, files, labels=formattedlabels) if needcheck: r = _check(repo, r, ui, tool, fcd, files) diff -r fb92df8b634c -r ed5448edcbfa mercurial/fileset.py --- a/mercurial/fileset.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/fileset.py Wed Apr 18 15:32:08 2018 -0400 @@ -20,6 +20,9 @@ scmutil, util, ) +from .utils import ( + stringutil, +) elements = { # token-type: binding-strength, primary, prefix, infix, suffix @@ -392,11 +395,10 @@ elif expr.startswith(">"): a = util.sizetoint(expr[1:]) return lambda x: x > a - elif expr[0].isdigit or expr[0] == '.': + else: a = util.sizetoint(expr) b = _sizetomax(expr) return lambda x: x >= a and x <= b - raise error.ParseError(_("couldn't parse size: %s") % expr) @predicate('size(expression)', callexisting=True) def size(mctx, x): @@ -446,7 +448,7 @@ s = [] for f in mctx.existing(): d = mctx.ctx[f].data() - if util.binary(d): + if stringutil.binary(d): continue if (enc == 'dos' or enc == 'win') and '\r\n' in d: s.append(f) @@ -511,9 +513,7 @@ revspec = getstring(r, reverr) if not revspec: raise error.ParseError(reverr) - basenode, node = scmutil.revpair(repo, [baserevspec, revspec]) - basectx = repo[basenode] - ctx = repo[node] + basectx, ctx = scmutil.revpair(repo, [baserevspec, revspec]) return getset(mctx.switch(ctx, _buildstatus(ctx, x, basectx=basectx)), x) @predicate('subrepo([pattern])') diff -r fb92df8b634c -r ed5448edcbfa mercurial/formatter.py 
--- a/mercurial/formatter.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/formatter.py Wed Apr 18 15:32:08 2018 -0400 @@ -95,7 +95,7 @@ >>> def subrepos(ui, fm): ... fm.startitem() ... fm.write(b'reponame', b'[%s]\\n', b'baz') -... files(ui, fm.nested(b'files')) +... files(ui, fm.nested(b'files', tmpl=b'{reponame}')) ... fm.end() >>> show(subrepos) [baz] @@ -124,8 +124,10 @@ templatefilters, templatekw, templater, + templateutil, util, ) +from .utils import dateutil pickle = util.pickle @@ -136,9 +138,15 @@ storecontext = False @staticmethod + def wrapnested(data, tmpl, sep): + '''wrap nested data by appropriate type''' + return data + @staticmethod def formatdate(date, fmt): '''convert date tuple to appropriate format''' - return date + # timestamp can be float, but the canonical form should be int + ts, tz = date + return (int(ts), tz) @staticmethod def formatdict(data, key, value, fmt, sep): '''convert dict or key-value pairs to appropriate dict format''' @@ -154,8 +162,7 @@ def __init__(self, ui, topic, opts, converter): self._ui = ui self._topic = topic - self._style = opts.get("style") - self._template = opts.get("template") + self._opts = opts self._converter = converter self._item = None # function to convert node to string suitable for this output @@ -175,10 +182,10 @@ def formatdate(self, date, fmt='%a %b %d %H:%M:%S %Y %1%2'): '''convert date tuple to appropriate format''' return self._converter.formatdate(date, fmt) - def formatdict(self, data, key='key', value='value', fmt='%s=%s', sep=' '): + def formatdict(self, data, key='key', value='value', fmt=None, sep=' '): '''convert dict or key-value pairs to appropriate dict format''' return self._converter.formatdict(data, key, value, fmt, sep) - def formatlist(self, data, name, fmt='%s', sep=' '): + def formatlist(self, data, name, fmt=None, sep=' '): '''convert iterable to appropriate list format''' # name is mandatory argument for now, but it could be optional if # we have default template keyword, 
e.g. {item} @@ -186,7 +193,7 @@ def context(self, **ctxs): '''insert context objects to be used to render template keywords''' ctxs = pycompat.byteskwargs(ctxs) - assert all(k == 'ctx' for k in ctxs) + assert all(k in {'ctx', 'fctx'} for k in ctxs) if self._converter.storecontext: self._item.update(ctxs) def data(self, **data): @@ -208,18 +215,19 @@ def isplain(self): '''check for plain formatter usage''' return False - def nested(self, field): + def nested(self, field, tmpl=None, sep=''): '''sub formatter to store nested data in the specified field''' - self._item[field] = data = [] + data = [] + self._item[field] = self._converter.wrapnested(data, tmpl, sep) return _nestedformatter(self._ui, self._converter, data) def end(self): '''end output for the formatter''' if self._item is not None: self._showitem() -def nullformatter(ui, topic): +def nullformatter(ui, topic, opts): '''formatter that prints nothing''' - return baseformatter(ui, topic, opts={}, converter=_nullconverter) + return baseformatter(ui, topic, opts, converter=_nullconverter) class _nestedformatter(baseformatter): '''build sub items and store them in the parent formatter''' @@ -241,17 +249,29 @@ storecontext = False @staticmethod + def wrapnested(data, tmpl, sep): + raise error.ProgrammingError('plainformatter should never be nested') + @staticmethod def formatdate(date, fmt): '''stringify date tuple in the given format''' - return util.datestr(date, fmt) + return dateutil.datestr(date, fmt) @staticmethod def formatdict(data, key, value, fmt, sep): '''stringify key-value pairs separated by sep''' - return sep.join(fmt % (k, v) for k, v in _iteritems(data)) + prefmt = pycompat.identity + if fmt is None: + fmt = '%s=%s' + prefmt = pycompat.bytestr + return sep.join(fmt % (prefmt(k), prefmt(v)) + for k, v in _iteritems(data)) @staticmethod def formatlist(data, name, fmt, sep): '''stringify iterable separated by sep''' - return sep.join(fmt % e for e in data) + prefmt = pycompat.identity + if fmt is 
None: + fmt = '%s' + prefmt = pycompat.bytestr + return sep.join(fmt % prefmt(e) for e in data) class plainformatter(baseformatter): '''the default text output scheme''' @@ -279,7 +299,7 @@ self._write(text, **opts) def isplain(self): return True - def nested(self, field): + def nested(self, field, tmpl=None, sep=''): # nested data will be directly written to ui return self def end(self): @@ -291,7 +311,7 @@ self._out = out self._out.write("%s = [\n" % self._topic) def _showitem(self): - self._out.write(" " + repr(self._item) + ",\n") + self._out.write(' %s,\n' % pycompat.byterepr(self._item)) def end(self): baseformatter.end(self) self._out.write("]\n") @@ -339,6 +359,10 @@ storecontext = True @staticmethod + def wrapnested(data, tmpl, sep): + '''wrap nested data by templatable type''' + return templateutil.mappinglist(data, tmpl=tmpl, sep=sep) + @staticmethod def formatdate(date, fmt): '''return date tuple''' return date @@ -348,14 +372,15 @@ data = util.sortdict(_iteritems(data)) def f(): yield _plainconverter.formatdict(data, key, value, fmt, sep) - return templatekw.hybriddict(data, key=key, value=value, fmt=fmt, gen=f) + return templateutil.hybriddict(data, key=key, value=value, fmt=fmt, + gen=f) @staticmethod def formatlist(data, name, fmt, sep): '''build object that can be evaluated as either plain string or list''' data = list(data) def f(): yield _plainconverter.formatlist(data, name, fmt, sep) - return templatekw.hybridlist(data, name=name, fmt=fmt, gen=f) + return templateutil.hybridlist(data, name=name, fmt=fmt, gen=f) class templateformatter(baseformatter): def __init__(self, ui, out, topic, opts): @@ -382,20 +407,7 @@ if part not in self._parts: return ref = self._parts[part] - - # TODO: add support for filectx. probably each template keyword or - # function will have to declare dependent resources. e.g. 
- # @templatekeyword(..., requires=('ctx',)) - props = {} - # explicitly-defined fields precede templatekw - props.update(item) - if 'ctx' in item: - # but template resources must be always available - props['repo'] = props['ctx'].repo() - props['revcache'] = {} - props = pycompat.strkwargs(props) - g = self._t(ref, **props) - self._out.write(templater.stringify(g)) + self._out.write(self._t.render(ref, item)) def end(self): baseformatter.end(self) @@ -488,16 +500,71 @@ t.cache[''] = tmpl return t -def templateresources(ui, repo=None): - """Create a dict of template resources designed for the default templatekw - and function""" - return { - 'cache': {}, # for templatekw/funcs to store reusable data - 'ctx': None, - 'repo': repo, - 'revcache': None, # per-ctx cache; set later - 'ui': ui, +class templateresources(templater.resourcemapper): + """Resource mapper designed for the default templatekw and function""" + + def __init__(self, ui, repo=None): + self._resmap = { + 'cache': {}, # for templatekw/funcs to store reusable data + 'repo': repo, + 'ui': ui, + } + + def availablekeys(self, context, mapping): + return {k for k, g in self._gettermap.iteritems() + if g(self, context, mapping, k) is not None} + + def knownkeys(self): + return self._knownkeys + + def lookup(self, context, mapping, key): + get = self._gettermap.get(key) + if not get: + return None + return get(self, context, mapping, key) + + def populatemap(self, context, origmapping, newmapping): + mapping = {} + if self._hasctx(newmapping): + mapping['revcache'] = {} # per-ctx cache + if (('node' in origmapping or self._hasctx(origmapping)) + and ('node' in newmapping or self._hasctx(newmapping))): + orignode = templateutil.runsymbol(context, origmapping, 'node') + mapping['originalnode'] = orignode + return mapping + + def _getsome(self, context, mapping, key): + v = mapping.get(key) + if v is not None: + return v + return self._resmap.get(key) + + def _hasctx(self, mapping): + return 'ctx' in mapping or 
'fctx' in mapping + + def _getctx(self, context, mapping, key): + ctx = mapping.get('ctx') + if ctx is not None: + return ctx + fctx = mapping.get('fctx') + if fctx is not None: + return fctx.changectx() + + def _getrepo(self, context, mapping, key): + ctx = self._getctx(context, mapping, 'ctx') + if ctx is not None: + return ctx.repo() + return self._getsome(context, mapping, key) + + _gettermap = { + 'cache': _getsome, + 'ctx': _getctx, + 'fctx': _getsome, + 'repo': _getrepo, + 'revcache': _getsome, + 'ui': _getsome, } + _knownkeys = set(_gettermap.keys()) def formatter(ui, out, topic, opts): template = opts.get("template", "") @@ -531,7 +598,7 @@ def _neverending(fm): yield fm -def maybereopen(fm, filename, opts): +def maybereopen(fm, filename): """Create a formatter backed by file if filename specified, else return the given formatter @@ -539,6 +606,6 @@ of the given formatter. """ if filename: - return openformatter(fm._ui, filename, fm._topic, opts) + return openformatter(fm._ui, filename, fm._topic, fm._opts) else: return _neverending(fm) diff -r fb92df8b634c -r ed5448edcbfa mercurial/graphmod.py --- a/mercurial/graphmod.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/graphmod.py Wed Apr 18 15:32:08 2018 -0400 @@ -454,7 +454,7 @@ if any(len(char) > 1 for char in edgemap.values()): # limit drawing an edge to the first or last N lines of the current # section the rest of the edge is drawn like a parent line. - parent = state['styles'][PARENT][-1] + parent = state['styles'][PARENT][-1:] def _drawgp(char, i): # should a grandparent character be drawn for this line? 
if len(char) < 2: @@ -463,7 +463,7 @@ # either skip first num lines or take last num lines, based on sign return -num <= i if num < 0 else (len(lines) - i) <= num for i, line in enumerate(lines): - line[:] = [c[-1] if _drawgp(c, i) else parent for c in line] + line[:] = [c[-1:] if _drawgp(c, i) else parent for c in line] edgemap.update( (e, (c if len(c) < 2 else parent)) for e, c in edgemap.items()) diff -r fb92df8b634c -r ed5448edcbfa mercurial/hbisect.py --- a/mercurial/hbisect.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hbisect.py Wed Apr 18 15:32:08 2018 -0400 @@ -55,7 +55,7 @@ if (len(state['bad']) == 1 and len(state['good']) == 1 and state['bad'] != state['good']): raise error.Abort(_("starting revisions are not directly related")) - raise error.Abort(_("inconsistent state, %s:%s is good and bad") + raise error.Abort(_("inconsistent state, %d:%s is good and bad") % (badrev, short(bad))) # build children dict @@ -267,12 +267,6 @@ return None -def shortlabel(label): - if label: - return label[0].upper() - - return None - def printresult(ui, repo, state, displayer, nodes, good): if len(nodes) == 1: # narrowed it down to a single revision diff -r fb92df8b634c -r ed5448edcbfa mercurial/help.py --- a/mercurial/help.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/help.py Wed Apr 18 15:32:08 2018 -0400 @@ -20,14 +20,15 @@ encoding, error, extensions, + fancyopts, filemerge, fileset, minirst, pycompat, revset, templatefilters, + templatefuncs, templatekw, - templater, util, ) from .hgweb import ( @@ -62,7 +63,8 @@ rst = loaddoc('extensions')(ui).splitlines(True) rst.extend(listexts( _('enabled extensions:'), extensions.enabled(), showdeprecated=True)) - rst.extend(listexts(_('disabled extensions:'), extensions.disabled())) + rst.extend(listexts(_('disabled extensions:'), extensions.disabled(), + showdeprecated=ui.verbose)) doc = ''.join(rst) return doc @@ -83,7 +85,10 @@ if shortopt: so = '-' + shortopt lo = '--' + longopt - if default: + + if 
isinstance(default, fancyopts.customopt): + default = default.getdefaultvalue() + if default and not callable(default): # default is of unknown type, and in Python 2 we abused # the %s-shows-repr property to handle integers etc. To # match that behavior on Python 3, we do str(default) and @@ -149,7 +154,7 @@ doclines = docs.splitlines() if doclines: summary = doclines[0] - cmdname = cmd.partition('|')[0].lstrip('^') + cmdname = cmdutil.parsealiases(cmd)[0] if filtercmd(ui, cmdname, kw, docs): continue results['commands'].append((cmdname, summary)) @@ -169,7 +174,7 @@ continue for cmd, entry in getattr(mod, 'cmdtable', {}).iteritems(): if kw in cmd or (len(entry) > 2 and lowercontains(entry[2])): - cmdname = cmd.partition('|')[0].lstrip('^') + cmdname = cmdutil.parsealiases(cmd)[0] cmddoc = pycompat.getdoc(entry[0]) if cmddoc: cmddoc = gettext(cmddoc).splitlines()[0] @@ -196,6 +201,8 @@ return loader internalstable = sorted([ + (['bundle2'], _('Bundle2'), + loaddoc('bundle2', subdir='internals')), (['bundles'], _('Bundles'), loaddoc('bundles', subdir='internals')), (['censor'], _('Censor'), @@ -306,7 +313,7 @@ addtopicsymbols('revisions', '.. predicatesmarker', revset.symbols) addtopicsymbols('templates', '.. keywordsmarker', templatekw.keywords) addtopicsymbols('templates', '.. filtersmarker', templatefilters.filters) -addtopicsymbols('templates', '.. functionsmarker', templater.funcs) +addtopicsymbols('templates', '.. functionsmarker', templatefuncs.funcs) addtopicsymbols('hgweb', '.. webcommandsmarker', webcommands.commands, dedent=True) @@ -327,7 +334,7 @@ # py3k fix: except vars can't be used outside the scope of the # except block, nor can be used inside a lambda. 
python issue4617 prefix = inst.args[0] - select = lambda c: c.lstrip('^').startswith(prefix) + select = lambda c: cmdutil.parsealiases(c)[0].startswith(prefix) rst = helplist(select) return rst @@ -363,8 +370,8 @@ if util.safehasattr(entry[0], 'definition'): # aliased command source = entry[0].source if entry[0].definition.startswith('!'): # shell alias - doc = (_('shell alias for::\n\n %s\n\ndefined by: %s\n') % - (entry[0].definition[1:], source)) + doc = (_('shell alias for: %s\n\n%s\n\ndefined by: %s\n') % + (entry[0].definition[1:], doc, source)) else: doc = (_('alias for: hg %s\n\n%s\n\ndefined by: %s\n') % (entry[0].definition, doc, source)) @@ -418,15 +425,18 @@ h = {} cmds = {} for c, e in commands.table.iteritems(): - f = c.partition("|")[0] - if select and not select(f): + fs = cmdutil.parsealiases(c) + f = fs[0] + p = '' + if c.startswith("^"): + p = '^' + if select and not select(p + f): continue if (not select and name != 'shortlist' and e[0].__module__ != commands.__name__): continue - if name == "shortlist" and not f.startswith("^"): + if name == "shortlist" and not p: continue - f = f.lstrip("^") doc = pycompat.getdoc(e[0]) if filtercmd(ui, f, name, doc): continue @@ -434,7 +444,7 @@ if not doc: doc = _("(no help text available)") h[f] = doc.splitlines()[0].rstrip() - cmds[f] = c.lstrip("^") + cmds[f] = '|'.join(fs) rst = [] if not h: diff -r fb92df8b634c -r ed5448edcbfa mercurial/help/config.txt --- a/mercurial/help/config.txt Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/help/config.txt Wed Apr 18 15:32:08 2018 -0400 @@ -1363,13 +1363,18 @@ ``args`` The arguments to pass to the tool executable. You can refer to the files being merged as well as the output file through these - variables: ``$base``, ``$local``, ``$other``, ``$output``. The meaning - of ``$local`` and ``$other`` can vary depending on which action is being - performed. 
During and update or merge, ``$local`` represents the original - state of the file, while ``$other`` represents the commit you are updating - to or the commit you are merging with. During a rebase ``$local`` - represents the destination of the rebase, and ``$other`` represents the - commit being rebased. + variables: ``$base``, ``$local``, ``$other``, ``$output``. + + The meaning of ``$local`` and ``$other`` can vary depending on which action is + being performed. During an update or merge, ``$local`` represents the original + state of the file, while ``$other`` represents the commit you are updating to or + the commit you are merging with. During a rebase, ``$local`` represents the + destination of the rebase, and ``$other`` represents the commit being rebased. + + Some operations define custom labels to assist with identifying the revisions, + accessible via ``$labellocal``, ``$labelother``, and ``$labelbase``. If custom + labels are not available, these will be ``local``, ``other``, and ``base``, + respectively. (default: ``$local $base $other``) ``premerge`` @@ -1405,6 +1410,21 @@ ``gui`` This tool requires a graphical interface to run. (default: False) +``mergemarkers`` + Controls whether the labels passed via ``$labellocal``, ``$labelother``, and + ``$labelbase`` are ``detailed`` (respecting ``mergemarkertemplate``) or + ``basic``. If ``premerge`` is ``keep`` or ``keep-merge3``, the conflict + markers generated during premerge will be ``detailed`` if either this option or + the corresponding option in the ``[ui]`` section is ``detailed``. + (default: ``basic``) + +``mergemarkertemplate`` + This setting can be used to override ``mergemarkertemplate`` from the ``[ui]`` + section on a per-tool basis; this applies to the ``$label``-prefixed variables + and to the conflict markers that are generated if ``premerge`` is ``keep`` or + ``keep-merge3``. See the corresponding variable in ``[ui]`` for more + information. + ..
container:: windows ``regkey`` @@ -1564,8 +1584,7 @@ In this section description, 'profiling data' stands for the raw data collected during profiling, while 'profiling report' stands for a -statistical text report generated from the profiling data. The -profiling is done using lsprof. +statistical text report generated from the profiling data. ``enabled`` Enable the profiler. @@ -1637,7 +1656,7 @@ Show at most this number of lines of drill-down info after each main entry. This can help explain the difference between Total and Inline. Specific to the ``ls`` instrumenting profiler. - (default: 5) + (default: 0) ``showmin`` Minimum fraction of samples an entry must have for it to be displayed. @@ -1772,6 +1791,19 @@ are highly recommended. Partial clones will still be allowed. (default: False) +``streamunbundle`` + When set, servers will apply data sent from the client directly, + otherwise it will be written to a temporary file first. This option + effectively prevents concurrent pushes. + +``pullbundle`` + When set, the server will check pullbundle.manifest for bundles + covering the requested heads and common nodes. The first matching + entry will be streamed to the client. + + For HTTP transport, the stream will still use zlib compression + for older clients. + ``concurrent-push-mode`` Level of allowed race condition between two pushing clients. @@ -2120,6 +2152,8 @@ markers is different from the encoding of the merged files, serious problems may occur. + Can be overridden per-merge-tool, see the ``[merge-tools]`` section. + ``origbackuppath`` The path to a directory used to store generated .orig files. If the path is not a directory, one will be created. If set, files stored in this @@ -2506,6 +2540,9 @@ Values less than or equal to 0 always refresh. (default: 20) +``server-header`` + Value for HTTP ``Server`` response header. + ``staticurl`` Base URL to use for static files. If unset, static files (e.g. 
the hgicon.png favicon) will be served by the CGI script itself. Use diff -r fb92df8b634c -r ed5448edcbfa mercurial/help/internals/bundle2.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/help/internals/bundle2.txt Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,677 @@ +Bundle2 refers to a data format that is used for both on-disk storage +and over-the-wire transfer of repository data and state. + +The data format allows the capture of multiple components of +repository data. Contrast with the initial bundle format, which +only captured *changegroup* data (and couldn't store bookmarks, +phases, etc). + +Bundle2 is used for: + +* Transferring data from a repository (e.g. as part of an ``hg clone`` + or ``hg pull`` operation). +* Transferring data to a repository (e.g. as part of an ``hg push`` + operation). +* Storing data on disk (e.g. the result of an ``hg bundle`` + operation). +* Transferring the results of a repository operation (e.g. the + reply to an ``hg push`` operation). + +At its highest level, a bundle2 payload is a stream that begins +with some metadata and consists of a series of *parts*, with each +part describing repository data or state or the result of an +operation. New bundle2 parts are introduced over time when there is +a need to capture a new form of data. A *capabilities* mechanism +exists to allow peers to understand which bundle2 parts the other +understands. + +Stream Format +============= + +A bundle2 payload consists of a magic string (``HG20``) followed by +stream level parameters, followed by any number of payload *parts*. + +It may help to think of the stream level parameters as *headers* and the +payload parts as the *body*. + +Stream Level Parameters +----------------------- + +Following the magic string is data that defines parameters applicable to the +entire payload. + +Stream level parameters begin with a 32-bit unsigned big-endian integer. 
+The value of this integer defines the number of bytes of stream level +parameters that follow. + +The *N* bytes of raw data contain a space separated list of parameters. +Each parameter consists of a required name and an optional value. + +Parameters have the form ``<name>`` or ``<name>=<value>``. + +Both the parameter name and value are URL quoted. + +Names MUST start with a letter. If the first letter is lower case, the +parameter is advisory and can safely be ignored. If the first letter +is upper case, the parameter is mandatory and the handler MUST stop if +it is unable to process it. + +Stream level parameters apply to the entire bundle2 payload. Lower-level +options should go into a bundle2 part instead. + +The following stream level parameters are defined: + +compression + Compression format of payload data. ``GZ`` denotes zlib. ``BZ`` + denotes bzip2. ``ZS`` denotes zstandard. + + When defined, all bytes after the stream level parameters are + compressed using the compression format defined by this parameter. + + If this parameter isn't present, data is raw/uncompressed. + + This parameter MUST be mandatory because attempting to consume + streams without knowing how to decode the underlying bytes will + result in errors. + +Payload Part +------------ + +Following the stream level parameters are 0 or more payload parts. Each +payload part consists of a header and a body. + +The payload part header consists of a 32-bit unsigned big-endian integer +defining the number of bytes in the header that follow. The special +value ``0`` indicates the end of the bundle2 stream. + +The binary format of the part header is as follows: + +* 8-bit unsigned size of the part name +* N-bytes alphanumeric part name +* 32-bit unsigned big-endian part ID +* N bytes part parameter data + +The *part name* identifies the type of the part. A part name with an +UPPERCASE letter is mandatory. Otherwise, the part is advisory.
A +consumer should abort if it encounters a mandatory part it doesn't know +how to process. See the sections below for each defined part type. + +The *part ID* is a unique identifier within the bundle used to refer to a +specific part. It should be unique within the bundle2 payload. + +Part parameter data consists of: + +* 1 byte number of mandatory parameters +* 1 byte number of advisory parameters +* 2 * N bytes of sizes of parameter key and values +* N * M blobs of values for parameter key and values + +Following the 2 bytes of mandatory and advisory parameter counts are +2-tuples of bytes of the sizes of each parameter. e.g. +(<key size>, <value size>). + +Following that are the raw values, without padding. Mandatory parameters +come first, followed by advisory parameters. + +Each parameter's key MUST be unique within the part. + +Following the part parameter data is the part payload. The part payload +consists of a series of framed chunks. The frame header is a 32-bit +big-endian integer defining the size of the chunk. The N bytes of raw +payload data follow. + +The part payload consists of 0 or more chunks. + +A chunk with size ``0`` denotes the end of the part payload. Therefore, +there will always be at least 1 32-bit integer following the payload +part header. + +A chunk size of ``-1`` is used to signal an *interrupt*. If such a chunk +size is seen, the stream processor should process the next bytes as a new +payload part. After this payload part, processing of the original, +interrupted part should resume. + +Capabilities +============ + +Bundle2 is a dynamic format that can evolve over time. For example, +when a new repository data concept is invented, a new bundle2 part +is typically invented to hold that data. In addition, parts performing +similar functionality may come into existence if there is a better +mechanism for performing certain functionality.
+ +Because the bundle2 format evolves over time, peers need to understand +what bundle2 features the other can understand. The *capabilities* +mechanism is how those features are expressed. + +Bundle2 capabilities are logically expressed as a dictionary of +string key-value pairs where the keys are strings and the values +are lists of strings. + +Capabilities are encoded for exchange between peers. The encoded +capabilities blob consists of a newline (``\n``) delimited list of +entries. Each entry has the form ``<name>`` or ``<name>=<value>``, +depending on whether the capability has a value. + +The capability name is URL quoted (``%XX`` encoding of URL unsafe +characters). + +The value, if present, is formed by URL quoting each value in +the capability list and concatenating the result with a comma (``,``). + +For example, take the capabilities ``novaluekey`` (no value) and ``listvaluekey`` +(values ``value 1`` and ``value 2``). These would be encoded as: + + listvaluekey=value%201,value%202\nnovaluekey + +The sections below detail the defined bundle2 capabilities. + +HG20 +---- + +Denotes that the peer supports the bundle2 data format. + +bookmarks +--------- + +Denotes that the peer supports the ``bookmarks`` part. + +Peers should not issue mandatory ``bookmarks`` parts unless this +capability is present. + +changegroup +----------- + +Denotes which versions of the *changegroup* format the peer can +receive. Values include ``01``, ``02``, and ``03``. + +The peer should not generate changegroup data for a version not +specified by this capability. + +checkheads +---------- + +Denotes which forms of heads checking the peer supports. + +If ``related`` is in the value, then the peer supports the ``check:heads`` +part and the peer is capable of detecting race conditions when applying +changelog data. + +digests +------- + +Denotes which hashing formats the peer supports. + +Values are names of hashing functions. Values include ``md5``, ``sha1``, +and ``sha512``.
+ +error +----- + +Denotes which ``error:`` parts the peer supports. + +Value is a list of strings of ``error:`` part names. Valid values +include ``abort``, ``unsupportedcontent``, ``pushraced``, and ``pushkey``. + +Peers should not issue an ``error:`` part unless the type of that +part is listed as supported by this capability. + +listkeys +-------- + +Denotes that the peer supports the ``listkeys`` part. + +hgtagsfnodes +------------ + +Denotes that the peer supports the ``hgtagsfnodes`` part. + +obsmarkers +---------- + +Denotes that the peer supports the ``obsmarkers`` part and which versions +of the obsolescence data format it can receive. Values are strings like +``V<int>``. e.g. ``V1``. + +phases +------ + +Denotes that the peer supports the ``phases`` part. + +pushback +-------- + +Denotes that the peer supports sending/receiving bundle2 data in response +to a bundle2 request. + +This capability is typically used by servers that employ server-side +rewriting of pushed repository data. For example, a server may wish to +automatically rebase pushed changesets. When this capability is present, +the server can send a bundle2 response containing the rewritten changeset +data and the client will apply it. + +pushkey +------- + +Denotes that the peer supports the ``pushkey`` part. + +remote-changegroup +------------------ + +Denotes that the peer supports the ``remote-changegroup`` part and +which protocols it can use to fetch remote changegroup data. + +Values are protocol names. e.g. ``http`` and ``https``. + +stream +------ + +Denotes that the peer supports ``stream*`` parts in order to support +*stream clone*. + +Values are which ``stream*`` parts the peer supports. ``v2`` denotes +support for the ``stream2`` part. + +Bundle2 Part Types +================== + +The sections below detail the various bundle2 part types. + +bookmarks +--------- + +The ``bookmarks`` part holds bookmarks information. + +This part has no parameters.
+ +The payload consists of entries defining bookmarks. Each entry consists of: + +* 20 bytes binary changeset node. +* 2 bytes big endian short defining bookmark name length. +* N bytes defining bookmark name. + +Receivers typically update bookmarks to match the state specified in +this part. + +changegroup +----------- + +The ``changegroup`` part contains *changegroup* data (changelog, manifestlog, +and filelog revision data). + +The following part parameters are defined for this part. + +version + Changegroup version string. e.g. ``01``, ``02``, and ``03``. This parameter + determines how to interpret the changegroup data within the part. + +nbchanges + The number of changesets in this changegroup. This parameter can be used + to aid in the display of progress bars, etc during part application. + +treemanifest + Whether the changegroup contains tree manifests. + +targetphase + The target phase of changesets in this part. Value is an integer of + the target phase. + +The payload of this part is raw changegroup data. See +:hg:`help internals.changegroups` for the format of changegroup data. + +check:bookmarks +--------------- + +The ``check:bookmarks`` part is inserted into a bundle as a means for the +receiver to validate that the sender's known state of bookmarks matches +the receiver's. + +This part has no parameters. + +The payload is a binary stream of bookmark data. Each entry in the stream +consists of: + +* 20 bytes binary node that bookmark is associated with +* 2 bytes unsigned short defining length of bookmark name +* N bytes containing the bookmark name + +If all bits in the node value are ``1``, then this signifies a missing +bookmark. + +When the receiver encounters this part, for each bookmark in the part +payload, it should validate that the current bookmark state matches +the specified state. If it doesn't, then the receiver should take +appropriate action. 
(In the case of pushes, this mismatch signifies +a race condition and the receiver should consider rejecting the push.) + +check:heads +----------- + +The ``check:heads`` part is a means to validate that the sender's state +of DAG heads matches the receiver's. + +This part has no parameters. + +The body of this part is an array of 20 byte binary nodes representing +changeset heads. + +Receivers should compare the set of heads defined in this part to the +current set of repo heads and take action if there is a mismatch in that +set. + +Note that this part applies to *all* heads in the repo. + +check:phases +------------ + +The ``check:phases`` part validates that the sender's state of phase +boundaries matches the receiver's. + +This part has no parameters. + +The payload consists of an array of 24 byte entries. Each entry is +a big endian 32-bit integer defining the phase integer and 20 byte +binary node value. + +For each changeset defined in this part, the receiver should validate +that its current phase matches the phase defined in this part. The +receiver should take appropriate action if a mismatch occurs. + +check:updated-heads +------------------- + +The ``check:updated-heads`` part validates that the sender's state of +DAG heads updated by this bundle matches the receiver's. + +This type is nearly identical to ``check:heads`` except the heads +in the payload are only a subset of heads in the repository. The +receiver should validate that all nodes specified by the sender are +branch heads and take appropriate action if not. + +error:abort +----------- + +The ``error:abort`` part conveys a fatal error. + +The following part parameters are defined: + +message + The string content of the error message. + +hint + Supplemental string giving a hint on how to fix the problem. + +error:pushkey +------------- + +The ``error:pushkey`` part conveys an error in the *pushkey* protocol. 
+ +The following part parameters are defined: + +namespace + The pushkey domain that exhibited the error. + +key + The key whose update failed. + +new + The value we tried to set the key to. + +old + The old value of the key (as supplied by the client). + +ret + The integer result code for the pushkey request. + +in-reply-to + Part ID that triggered this error. + +This part is generated if there was an error applying *pushkey* data. +Pushkey data includes bookmarks, phases, and obsolescence markers. + +error:pushraced +--------------- + +The ``error:pushraced`` part conveys that an error occurred and +the likely cause is losing a race with another pusher. + +The following part parameters are defined: + +message + String error message. + +This part is typically emitted when a receiver examining ``check:*`` +parts encountered inconsistency between incoming state and local state. +The likely cause of that inconsistency is another repository change +operation (often another client performing an ``hg push``). + +error:unsupportedcontent +------------------------ + +The ``error:unsupportedcontent`` part conveys that a bundle2 receiver +encountered a part or content it was not able to handle. + +The following part parameters are defined: + +parttype + The name of the part that triggered this error. + +params + ``\0`` delimited list of parameters. + +hgtagsfnodes +------------ + +The ``hgtagsfnodes`` type defines file nodes for the ``.hgtags`` file +for various changesets. + +This part has no parameters. + +The payload is an array of pairs of 20 byte binary nodes. The first node +is a changeset node. The second node is the ``.hgtags`` file node. + +Resolving tags requires resolving the ``.hgtags`` file node for changesets. +On large repositories, this can be expensive. Repositories cache the +mapping of changeset to ``.hgtags`` file node on disk as a performance +optimization. This part allows that cached data to be transferred alongside +changeset data. 
+ +Receivers should update their ``.hgtags`` cache file node mappings with +the incoming data. + +listkeys +-------- + +The ``listkeys`` part holds content for a *pushkey* namespace. + +The following part parameters are defined: + +namespace + The pushkey domain this data belongs to. + +The part payload contains a newline (``\n``) delimited list of +tab (``\t``) delimited key-value pairs defining entries in this pushkey +namespace. + +obsmarkers +---------- + +The ``obsmarkers`` part defines obsolescence markers. + +This part has no parameters. + +The payload consists of obsolescence markers using the on-disk markers +format. The first byte defines the version format. + +The receiver should apply the obsolescence markers defined in this +part. A ``reply:obsmarkers`` part should be sent to the sender, if possible. + +output +------ + +The ``output`` part is used to display output on the receiver. + +This part has no parameters. + +The payload consists of raw data to be printed on the receiver. + +phase-heads +----------- + +The ``phase-heads`` part defines phase boundaries. + +This part has no parameters. + +The payload consists of an array of 24 byte entries. Each entry is +a big endian 32-bit integer defining the phase integer and 20 byte +binary node value. + +pushkey +------- + +The ``pushkey`` part communicates an intent to perform a ``pushkey`` +request. + +The following part parameters are defined: + +namespace + The pushkey domain to operate on. + +key + The key within the pushkey namespace that is being changed. + +old + The old value for the key being changed. + +new + The new value for the key being changed. + +This part has no payload. + +The receiver should perform a pushkey operation as described by this +part's parameters. + +If the pushkey operation fails, a ``reply:pushkey`` part should be sent +back to the sender, if possible. The ``in-reply-to`` part parameter +should reference the source part.
+ +pushvars +-------- + +The ``pushvars`` part defines environment variables that should be +set when processing this bundle2 payload. + +The part's advisory parameters define environment variables. + +There is no part payload. + +When received, part parameters are prefixed with ``USERVAR_`` and the +resulting variables are defined in the hooks context for the current +bundle2 application. This part provides a mechanism for senders to +inject extra state into the hook execution environment on the receiver. + +remote-changegroup +------------------ + +The ``remote-changegroup`` part defines an external location of a bundle +to apply. This part can be used by servers to serve pre-generated bundles +hosted at arbitrary URLs. + +The following part parameters are defined: + +url + The URL of the remote bundle. + +size + The size in bytes of the remote bundle. + +digests + A space separated list of the digest types provided in additional + part parameters. + +digest: + The hexadecimal representation of the digest (hash) of the remote bundle. + +There is no payload for this part type. + +When encountered, clients should attempt to fetch the URL being advertised +and read and apply it as a bundle. + +The ``size`` and ``digest:`` parameters should be used to validate +that the downloaded bundle matches what was advertised. If a mismatch occurs, +the client should abort. + +reply:changegroup +----------------- + +The ``reply:changegroup`` part conveys the results of application of a +``changegroup`` part. + +The following part parameters are defined: + +return + Integer return code from changegroup application. + +in-reply-to + Part ID of part this reply is in response to. + +reply:obsmarkers +---------------- + +The ``reply:obsmarkers`` part conveys the results of applying an +``obsmarkers`` part. + +The following part parameters are defined: + +new + The integer number of new markers that were applied. + +in-reply-to + The part ID that this part is in reply to. 
+ +reply:pushkey +------------- + +The ``reply:pushkey`` part conveys the result of a *pushkey* operation. + +The following part parameters are defined: + +return + Integer result code from pushkey operation. + +in-reply-to + Part ID that triggered this pushkey operation. + +This part has no payload. + +replycaps +--------- + +The ``replycaps`` part notifies the receiver that a reply bundle should +be created. + +This part has no parameters. + +The payload consists of a bundle2 capabilities blob. + +stream2 +------- + +The ``stream2`` part contains *streaming clone* version 2 data. + +The following part parameters are defined: + +requirements + URL quoted repository requirements string. Requirements are delimited by a + comma (``,``). + +filecount + The total number of files being transferred in the payload. + +bytecount + The total size of file content being transferred in the payload. + +The payload consists of raw stream clone version 2 data. + +The ``filecount`` and ``bytecount`` parameters can be used for progress and +reporting purposes. The values may not be exact. diff -r fb92df8b634c -r ed5448edcbfa mercurial/help/internals/bundles.txt --- a/mercurial/help/internals/bundles.txt Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/help/internals/bundles.txt Wed Apr 18 15:32:08 2018 -0400 @@ -63,8 +63,7 @@ ``HG20`` is currently the only defined bundle2 version. -The ``HG20`` format is not yet documented here. See the inline comments -in ``mercurial/exchange.py`` for now. +The ``HG20`` format is documented at :hg:`help internals.bundle2`. Initial ``HG20`` support was added in Mercurial 3.0 (released May 2014).
However, bundle2 bundles were hidden behind an experimental flag diff -r fb92df8b634c -r ed5448edcbfa mercurial/help/internals/requirements.txt --- a/mercurial/help/internals/requirements.txt Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/help/internals/requirements.txt Wed Apr 18 15:32:08 2018 -0400 @@ -1,4 +1,3 @@ - Repositories contain a file (``.hg/requires``) containing a list of features/capabilities that are *required* for clients to interface with the repository. This file has been present in Mercurial since @@ -105,8 +104,10 @@ Denotes that version 2 of manifests are being used. Support for this requirement was added in Mercurial 3.4 (released -May 2015). The requirement is currently experimental and is disabled -by default. +May 2015). The new format failed to meet expectations and support +for the format and requirement were removed in Mercurial 4.6 +(released May 2018) since the feature never graduated from experimental +status. treemanifest ============ diff -r fb92df8b634c -r ed5448edcbfa mercurial/help/internals/wireprotocol.txt --- a/mercurial/help/internals/wireprotocol.txt Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/help/internals/wireprotocol.txt Wed Apr 18 15:32:08 2018 -0400 @@ -10,11 +10,79 @@ The protocol is synchronous and does not support multiplexing (concurrent commands). -Transport Protocols -=================== +Handshake +========= + +It is required or common for clients to perform a *handshake* when connecting +to a server. The handshake serves the following purposes: + +* Negotiating protocol/transport level options +* Allows the client to learn about server capabilities to influence + future requests +* Ensures the underlying transport channel is in a *clean* state + +An important goal of the handshake is to allow clients to use more modern +wire protocol features. By default, clients must assume they are talking +to an old version of Mercurial server (possibly even the very first +implementation).
So, clients should not attempt to call or utilize modern +wire protocol features until they have confirmation that the server +supports them. The handshake implementation is designed to allow both +ends to utilize the latest set of features and capabilities with as +few round trips as possible. + +The handshake mechanism varies by transport and protocol and is documented +in the sections below. + +HTTP Protocol +============= + +Handshake +--------- + +The client sends a ``capabilities`` command request (``?cmd=capabilities``) +as soon as HTTP requests may be issued. -HTTP Transport --------------- +By default, the server responds with a version 1 capabilities string, which +the client parses to learn about the server's abilities. The ``Content-Type`` +for this response is ``application/mercurial-0.1`` or +``application/mercurial-0.2`` depending on whether the client advertised +support for version ``0.2`` in its request. (Clients aren't supposed to +advertise support for ``0.2`` until the capabilities response indicates +the server's support for that media type. However, a client could +conceivably cache this metadata and issue the capabilities request in such +a way to elicit an ``application/mercurial-0.2`` response.) + +Clients wishing to switch to a newer API service may send an +``X-HgUpgrade-`` header containing a space-delimited list of API service +names the client is capable of speaking. The request MUST also include an +``X-HgProto-`` header advertising a known serialization format for the +response. ``cbor`` is currently the only defined serialization format. + +If the request contains these headers, the response ``Content-Type`` MAY +be for a different media type. e.g. ``application/mercurial-cbor`` if the +client advertises support for CBOR. + +The response MUST be deserializable to a map with the following keys: + +apibase + URL path to API services, relative to the repository root. e.g. ``api/``. 
+ +apis + A map of API service names to API descriptors. An API descriptor contains + more details about that API. In the case of the HTTP Version 2 Transport, + it will be the normal response to a ``capabilities`` command. + + Only the services advertised by the client that are also available on + the server are advertised. + +v1capabilities + The capabilities string that would be returned by a version 1 response. + +The client can then inspect the server-advertised APIs and decide which +API to use, including continuing to use the HTTP Version 1 Transport. + +HTTP Version 1 Transport +------------------------ Commands are issued as HTTP/1.0 or HTTP/1.1 requests. Commands are sent to the base URL of the repository with the command name sent in @@ -91,11 +159,8 @@ The content of the HTTP response body typically holds text describing the error. -The ``application/hg-changegroup`` media type indicates a changegroup response -type. - -Clients also accept the ``text/plain`` media type. All other media -types should cause the client to error. +The ``application/mercurial-cbor`` media type indicates a CBOR payload +and should be interpreted as identical to ``application/cbor``. Behavior of media types is further described in the ``Content Negotiation`` section below. @@ -112,11 +177,256 @@ ``application/mercurial-0.*`` media type and the HTTP response is typically using *chunked transfer* (``Transfer-Encoding: chunked``). -SSH Transport -============= +HTTP Version 2 Transport +------------------------ + +**Experimental - feature under active development** + +Version 2 of the HTTP protocol is exposed under the ``/api/*`` URL space. +Its final API name is not yet formalized. + +Commands are triggered by sending HTTP POST requests against URLs of the +form ``<permission>/<command>``, where ``<permission>`` is ``ro`` or +``rw``, meaning read-only and read-write, respectively and ``<command>`` +is a named wire protocol command. + +Non-POST request methods MUST be rejected by the server with an HTTP +405 response.
+ +Commands that modify repository state in meaningful ways MUST NOT be +exposed under the ``ro`` URL prefix. All available commands MUST be +available under the ``rw`` URL prefix. + +Server administrators MAY implement blanket HTTP authentication keyed +off the URL prefix. For example, a server may require authentication +for all ``rw/*`` URLs and let unauthenticated requests to ``ro/*`` +URLs proceed. A server MAY issue an HTTP 401, 403, or 407 response +in accordance with RFC 7235. Clients SHOULD recognize the HTTP Basic +(RFC 7617) and Digest (RFC 7616) authentication schemes. Clients SHOULD +make an attempt to recognize unknown schemes using the +``WWW-Authenticate`` response header on a 401 response, as defined by +RFC 7235. + +Read-only commands are accessible under ``rw/*`` URLs so clients can +signal the intent of the operation very early in the connection +lifecycle. For example, a ``push`` operation - which consists of +various read-only commands mixed with at least one read-write command - +can perform all commands against ``rw/*`` URLs so that any server-side +authentication requirements are discovered upon attempting the first +command - not potentially several commands into the exchange. This +allows clients to fail faster or prompt for credentials as soon as the +exchange takes place. This provides a better end-user experience. + +Requests to unknown commands or URLs result in an HTTP 404. +TODO formally define response type, how error is communicated, etc. + +HTTP request and response bodies use the *Unified Frame-Based Protocol* +(defined below) for media exchange. The entirety of the HTTP message +body is 0 or more frames as defined by this protocol. + +Clients and servers MUST advertise the ``TBD`` media type via the +``Content-Type`` request and response headers. In addition, clients MUST +advertise this media type value in their ``Accept`` request header in all +requests. +TODO finalize the media type.
For now, it is defined in wireprotoserver.py. + +Servers receiving requests without an ``Accept`` header SHOULD respond with +an HTTP 406. + +Servers receiving requests with an invalid ``Content-Type`` header SHOULD +respond with an HTTP 415. + +The command to run is specified in the POST payload as defined by the +*Unified Frame-Based Protocol*. This is redundant with data already +encoded in the URL. This is by design, so server operators can have +better understanding about server activity from looking merely at +HTTP access logs. + +In most circumstances, the command specified in the URL MUST match +the command specified in the frame-based payload or the server will +respond with an error. The exception to this is the special +``multirequest`` URL. (See below.) In addition, HTTP requests +are limited to one command invocation. The exception is the special +``multirequest`` URL. + +The ``multirequest`` command endpoints (``ro/multirequest`` and +``rw/multirequest``) are special in that they allow the execution of +*any* command and allow the execution of multiple commands. If the +HTTP request issues multiple commands across multiple frames, all +issued commands will be processed by the server. Per the defined +behavior of the *Unified Frame-Based Protocol*, commands may be +issued interleaved and responses may come back in a different order +than they were issued. Clients MUST be able to deal with this. + +SSH Protocol +============ + +Handshake +--------- + +For all clients, the handshake consists of the client sending 1 or more +commands to the server using version 1 of the transport. Servers respond +to commands they know how to respond to and send an empty response (``0\n``) +for unknown commands (per standard behavior of version 1 of the transport). +Clients then typically look for a response to the newest sent command to +determine which transport version to use and what the available features for +the connection and server are. 
+ +Preceding any response from client-issued commands, the server may print +non-protocol output. It is common for SSH servers to print banners, message +of the day announcements, etc when clients connect. It is assumed that any +such *banner* output will precede any Mercurial server output. So clients +must be prepared to handle server output on initial connect that isn't +in response to any client-issued command and doesn't conform to Mercurial's +wire protocol. This *banner* output should only be on stdout. However, +some servers may send output on stderr. + +Pre 0.9.1 clients issue a ``between`` command with the ``pairs`` argument +having the value +``0000000000000000000000000000000000000000-0000000000000000000000000000000000000000``. + +The ``between`` command has been supported since the original Mercurial +SSH server. Requesting the empty range will return a ``\n`` string response, +which will be encoded as ``1\n\n`` (value length of ``1`` followed by a newline +followed by the value, which happens to be a newline). + +For pre 0.9.1 clients and all servers, the exchange looks like:: + + c: between\n + c: pairs 81\n + c: 0000000000000000000000000000000000000000-0000000000000000000000000000000000000000 + s: 1\n + s: \n + +0.9.1+ clients send a ``hello`` command (with no arguments) before the +``between`` command. The response to this command allows clients to +discover server capabilities and settings. -The SSH transport is a custom text-based protocol suitable for use over any -bi-directional stream transport. It is most commonly used with SSH. 
+An example exchange between 0.9.1+ clients and a ``hello`` aware server looks +like:: + + c: hello\n + c: between\n + c: pairs 81\n + c: 0000000000000000000000000000000000000000-0000000000000000000000000000000000000000 + s: 324\n + s: capabilities: lookup changegroupsubset branchmap pushkey known getbundle ...\n + s: 1\n + s: \n + +And a similar scenario but with servers sending a banner on connect:: + + c: hello\n + c: between\n + c: pairs 81\n + c: 0000000000000000000000000000000000000000-0000000000000000000000000000000000000000 + s: welcome to the server\n + s: if you find any issues, email someone@somewhere.com\n + s: 324\n + s: capabilities: lookup changegroupsubset branchmap pushkey known getbundle ...\n + s: 1\n + s: \n + +Note that output from the ``hello`` command is terminated by a ``\n``. This is +part of the response payload and not part of the wire protocol adding a newline +after responses. In other words, the length of the response contains the +trailing ``\n``. + +Clients supporting version 2 of the SSH transport send a line beginning +with ``upgrade`` before the ``hello`` and ``between`` commands. The line +(which isn't a well-formed command line because it doesn't consist of a +single command name) serves to both communicate the client's intent to +switch to transport version 2 (transports are version 1 by default) as +well as to advertise the client's transport-level capabilities so the +server may satisfy that request immediately. + +The upgrade line has the form: + + upgrade + +That is the literal string ``upgrade`` followed by a space, followed by +a randomly generated string, followed by a space, followed by a string +denoting the client's transport capabilities. + +The token can be anything. However, a random UUID is recommended. (Use +of version 4 UUIDs is recommended because version 1 UUIDs can leak the +client's MAC address.) 
+ +The transport capabilities string is a URL/percent encoded string +containing key-value pairs defining the client's transport-level +capabilities. The following capabilities are defined: + +proto + A comma-delimited list of transport protocol versions the client + supports. e.g. ``ssh-v2``. + +If the server does not recognize the ``upgrade`` line, it should issue +an empty response and continue processing the ``hello`` and ``between`` +commands. Here is an example handshake between a version 2 aware client +and a non version 2 aware server: + + c: upgrade 2e82ab3f-9ce3-4b4e-8f8c-6fd1c0e9e23a proto=ssh-v2 + c: hello\n + c: between\n + c: pairs 81\n + c: 0000000000000000000000000000000000000000-0000000000000000000000000000000000000000 + s: 0\n + s: 324\n + s: capabilities: lookup changegroupsubset branchmap pushkey known getbundle ...\n + s: 1\n + s: \n + +(The initial ``0\n`` line from the server indicates an empty response to +the unknown ``upgrade ..`` command/line.) + +If the server recognizes the ``upgrade`` line and is willing to satisfy that +upgrade request, it replies with a payload of the following form: + + upgraded <token> <transport name>\n + +This line is the literal string ``upgraded``, a space, the token that was +specified by the client in its ``upgrade ...`` request line, a space, and the +name of the transport protocol that was chosen by the server. The transport +name MUST match one of the names the client specified in the ``proto`` field +of its ``upgrade ...`` request line. + +If a server issues an ``upgraded`` response, it MUST also read and ignore +the lines associated with the ``hello`` and ``between`` command requests +that were issued by the client. It is assumed that the negotiated transport +will respond with equivalent requested information following the transport +handshake. + +All data following the ``\n`` terminating the ``upgraded`` line is the +domain of the negotiated transport.
It is common for the data immediately +following to contain additional metadata about the state of the transport and +the server. However, this isn't strictly speaking part of the transport +handshake and isn't covered by this section. + +Here is an example handshake between a version 2 aware client and a version +2 aware server: + + c: upgrade 2e82ab3f-9ce3-4b4e-8f8c-6fd1c0e9e23a proto=ssh-v2 + c: hello\n + c: between\n + c: pairs 81\n + c: 0000000000000000000000000000000000000000-0000000000000000000000000000000000000000 + s: upgraded 2e82ab3f-9ce3-4b4e-8f8c-6fd1c0e9e23a ssh-v2\n + s: + +The client-issued token that is echoed in the response provides a more +resilient mechanism for differentiating *banner* output from Mercurial +output. In version 1, properly formatted banner output could get confused +for Mercurial server output. By submitting a randomly generated token +that is then present in the response, the client can look for that token +in response lines and have reasonable certainty that the line did not +originate from a *banner* message. + +SSH Version 1 Transport +----------------------- + +The SSH transport (version 1) is a custom text-based protocol suitable for +use over any bi-directional stream transport. It is most commonly used with +SSH. A SSH transport server can be started with ``hg serve --stdio``. The stdin, stderr, and stdout file descriptors of the started process are used to exchange @@ -174,6 +484,532 @@ The server terminates if it receives an empty command (a ``\n`` character). +If the server announces support for the ``protocaps`` capability, the client +should issue a ``protocaps`` command after the initial handshake to announce +its own capabilities. The client capabilities are persistent. + +SSH Version 2 Transport +----------------------- + +**Experimental and under development** + +Version 2 of the SSH transport behaves identically to version 1 of the SSH +transport with the exception of handshake semantics.
See above for how +version 2 of the SSH transport is negotiated. + +Immediately following the ``upgraded`` line signaling a switch to version +2 of the SSH protocol, the server automatically sends additional details +about the capabilities of the remote server. This has the form: + + \n + capabilities: ...\n + +e.g. + + s: upgraded 2e82ab3f-9ce3-4b4e-8f8c-6fd1c0e9e23a ssh-v2\n + s: 240\n + s: capabilities: known getbundle batch ...\n + +Following capabilities advertisement, the peers communicate using version +1 of the SSH transport. + +Unified Frame-Based Protocol +============================ + +**Experimental and under development** + +The *Unified Frame-Based Protocol* is a communications protocol between +Mercurial peers. The protocol aims to be mostly transport agnostic +(works similarly on HTTP, SSH, etc). + +To operate the protocol, a bi-directional, half-duplex pipe supporting +ordered sends and receives is required. That is, each peer has one pipe +for sending data and another for receiving. + +All data is read and written in atomic units called *frames*. These +are conceptually similar to TCP packets. Higher-level functionality +is built on the exchange and processing of frames. + +All frames are associated with a *stream*. A *stream* provides a +unidirectional grouping of frames. Streams facilitate two goals: +content encoding and parallelism. There is a dedicated section on +streams below. + +The protocol is request-response based: the client issues requests to +the server, which issues replies to those requests. Server-initiated +messaging is not currently supported, but this specification carves +out room to implement it. + +All frames are associated with a numbered request. Frames can thus +be logically grouped by their request ID. 
+ +Frames begin with an 8 octet header followed by a variable length +payload:: + + +------------------------------------------------+ + | Length (24) | + +--------------------------------+---------------+ + | Request ID (16) | Stream ID (8) | + +------------------+-------------+---------------+ + | Stream Flags (8) | + +-----------+------+ + | Type (4) | + +-----------+ + | Flags (4) | + +===========+===================================================| + | Frame Payload (0...) ... + +---------------------------------------------------------------+ + +The length of the frame payload is expressed as an unsigned 24 bit +little endian integer. Values larger than 65535 MUST NOT be used unless +given permission by the server as part of the negotiated capabilities +during the handshake. The frame header is not part of the advertised +frame length. The payload length is the over-the-wire length. If there +is content encoding applied to the payload as part of the frame's stream, +the length is the output of that content encoding, not the input. + +The 16-bit ``Request ID`` field denotes the integer request identifier, +stored as an unsigned little endian integer. Odd numbered requests are +client-initiated. Even numbered requests are server-initiated. This +refers to where the *request* was initiated - not where the *frame* was +initiated, so servers will send frames with odd ``Request ID`` in +response to client-initiated requests. Implementations are advised to +start ordering request identifiers at ``1`` and ``0``, increment by +``2``, and wrap around if all available numbers have been exhausted. + +The 8-bit ``Stream ID`` field denotes the stream that the frame is +associated with. Frames belonging to a stream may have content +encoding applied and the receiver may need to decode the raw frame +payload to obtain the original data. Odd numbered IDs are +client-initiated. Even numbered IDs are server-initiated. 
+ +The 8-bit ``Stream Flags`` field defines stream processing semantics. +See the section on streams below. + +The 4-bit ``Type`` field denotes the type of frame being sent. + +The 4-bit ``Flags`` field defines special, per-type attributes for +the frame. + +The sections below define the frame types and their behavior. + +Command Request (``0x01``) +-------------------------- + +This frame contains a request to run a command. + +The payload consists of a CBOR map defining the command request. The +bytestring keys of that map are: + +name + Name of the command that should be executed (bytestring). +args + Map of bytestring keys to various value types containing the named + arguments to this command. + + Each command defines its own set of argument names and their expected + types. + +This frame type MUST ONLY be sent from clients to servers: it is illegal +for a server to send this frame to a client. + +The following flag values are defined for this type: + +0x01 + New command request. When set, this frame represents the beginning + of a new request to run a command. The ``Request ID`` attached to this + frame MUST NOT be active. +0x02 + Command request continuation. When set, this frame is a continuation + from a previous command request frame for its ``Request ID``. This + flag is set when the CBOR data for a command request does not fit + in a single frame. +0x04 + Additional frames expected. When set, the command request didn't fit + into a single frame and additional CBOR data follows in a subsequent + frame. +0x08 + Command data frames expected. When set, command data frames are + expected to follow the final command request frame for this request. + +``0x01`` MUST be set on the initial command request frame for a +``Request ID``. + +``0x01`` or ``0x02`` MUST be set to indicate this frame's role in +a series of command request frames. + +If command data frames are to be sent, ``0x08`` MUST be set on ALL +command request frames. 
+ +Command Data (``0x02``) +----------------------- + +This frame contains raw data for a command. + +Most commands can be executed by specifying arguments. However, +arguments have an upper bound to their length. For commands that +accept data that is beyond this length or whose length isn't known +when the command is initially sent, they will need to stream +arbitrary data to the server. This frame type facilitates the sending +of this data. + +The payload of this frame type consists of a stream of raw data to be +consumed by the command handler on the server. The format of the data +is command specific. + +The following flag values are defined for this type: + +0x01 + Command data continuation. When set, the data for this command + continues into a subsequent frame. + +0x02 + End of data. When set, command data has been fully sent to the + server. The command has been fully issued and no new data for this + command will be sent. The next frame will belong to a new command. + +Command Response Data (``0x03``) +-------------------------------- + +This frame contains response data to an issued command. + +Response data ALWAYS consists of a series of 1 or more CBOR encoded +values. A CBOR value may be using indefinite length encoding. And the +bytes constituting the value may span several frames. + +The following flag values are defined for this type: + +0x01 + Data continuation. When set, an additional frame containing response data + will follow. +0x02 + End of data. When set, the response data has been fully sent and + no additional frames for this response will be sent. + +The ``0x01`` flag is mutually exclusive with the ``0x02`` flag. + +Error Occurred (``0x05``) +------------------------- + +Some kind of error occurred. 
+ +There are 3 general kinds of failures that can occur: + +* Command error encountered before any response issued +* Command error encountered after a response was issued +* Protocol or stream level error + +This frame type is used to capture the latter cases. (The general +command error case is handled by the leading CBOR map in +``Command Response`` frames.) + +The payload of this frame contains a CBOR map detailing the error. That +map has the following bytestring keys: + +type + (bytestring) The overall type of error encountered. Can be one of the + following values: + + protocol + A protocol-level error occurred. This typically means someone + is violating the framing protocol semantics and the server is + refusing to proceed. + + server + A server-level error occurred. This typically indicates some kind of + logic error on the server, likely the fault of the server. + + command + A command-level error, likely the fault of the client. + +message + (array of maps) A richly formatted message that is intended for + human consumption. See the ``Human Output Side-Channel`` frame + section for a description of the format of this data structure. + +Human Output Side-Channel (``0x06``) +------------------------------------ + +This frame contains a message that is intended to be displayed to +people. Whereas most frames communicate machine readable data, this +frame communicates textual data that is intended to be shown to +humans. + +The frame consists of a series of *formatting requests*. Each formatting +request consists of a formatting string, arguments for that formatting +string, and labels to apply to that formatting string. + +A formatting string is a printf()-like string that allows variable +substitution within the string. Labels allow the rendered text to be +*decorated*. Assuming use of the canonical Mercurial code base, a +formatting string can be the input to the ``i18n._`` function. This +allows messages emitted from the server to be localized. 
So even if +the server has different i18n settings, people could see messages in +their *native* settings. Similarly, the use of labels allows +decorations like coloring and underlining to be applied using the +client's configured rendering settings. + +Formatting strings are similar to ``printf()`` strings or how +Python's ``%`` operator works. The only supported formatting sequences +are ``%s`` and ``%%``. ``%s`` will be replaced by whatever the string +at that position resolves to. ``%%`` will be replaced by ``%``. All +other 2-byte sequences beginning with ``%`` represent a literal +``%`` followed by that character. However, future versions of the +wire protocol reserve the right to allow clients to opt in to receiving +formatting strings with additional formatters, hence why ``%%`` is +required to represent the literal ``%``. + +The frame payload consists of a CBOR array of CBOR maps. Each map +defines an *atom* of text data to print. Each *atom* has the following +bytestring keys: + +msg + (bytestring) The formatting string. Content MUST be ASCII. +args (optional) + Array of bytestrings defining arguments to the formatting string. +labels (optional) + Array of bytestrings defining labels to apply to this atom. + +All data to be printed MUST be encoded into a single frame: this frame +does not support spanning data across multiple frames. + +All textual data encoded in these frames is assumed to be line delimited. +The last atom in the frame SHOULD end with a newline (``\n``). If it +doesn't, clients MAY add a newline to facilitate immediate printing. + +Progress Update (``0x07``) +-------------------------- + +This frame holds the progress of an operation on the peer. Consumption +of these frames allows clients to display progress bars, estimated +completion times, etc. + +Each frame defines the progress of a single operation on the peer. 
The +payload consists of a CBOR map with the following bytestring keys: + +topic + Topic name (string) +pos + Current numeric position within the topic (integer) +total + Total/end numeric position of this topic (unsigned integer) +label (optional) + Unit label (string) +item (optional) + Item name (string) + +Progress state is created when a frame is received referencing a +*topic* that isn't currently tracked. Progress tracking for that +*topic* is finished when a frame is received reporting the current +position of that topic as ``-1``. + +Multiple *topics* may be active at any given time. + +Rendering of progress information is not mandated or governed by this +specification: implementations MAY render progress information however +they see fit, including not at all. + +The string data describing the topic SHOULD be static strings to +facilitate receivers localizing that string data. The emitter +MUST normalize all string data to valid UTF-8 and receivers SHOULD +validate that received data conforms to UTF-8. The topic name +SHOULD be ASCII. + +Stream Encoding Settings (``0x08``) +----------------------------------- + +This frame type holds information defining the content encoding +settings for a *stream*. + +This frame type is likely consumed by the protocol layer and is not +passed on to applications. + +This frame type MUST ONLY occur on frames having the *Beginning of Stream* +``Stream Flag`` set. + +The payload of this frame defines what content encoding has (possibly) +been applied to the payloads of subsequent frames in this stream. + +The payload begins with an 8-bit integer defining the length of the +encoding *profile*, followed by the string name of that profile, which +must be an ASCII string. All bytes that follow can be used by that +profile for supplemental settings definitions. See the section below +on defined encoding profiles. + +Stream States and Flags +----------------------- + +Streams can be in two states: *open* and *closed*. 
An *open* stream +is active and frames attached to that stream could arrive at any time. +A *closed* stream is not active. If a frame attached to a *closed* +stream arrives, that frame MUST have an appropriate stream flag +set indicating beginning of stream. All streams are in the *closed* +state by default. + +The ``Stream Flags`` field denotes a set of bit flags for defining +the relationship of this frame within a stream. The following flags +are defined: + +0x01 + Beginning of stream. The first frame in the stream MUST set this + flag. When received, the ``Stream ID`` this frame is attached to + becomes ``open``. + +0x02 + End of stream. The last frame in a stream MUST set this flag. When + received, the ``Stream ID`` this frame is attached to becomes + ``closed``. Any content encoding context associated with this stream + can be destroyed after processing the payload of this frame. + +0x04 + Apply content encoding. When set, any content encoding settings + defined by the stream should be applied when attempting to read + the frame. When not set, the frame payload isn't encoded. + +Streams +------- + +Streams - along with ``Request IDs`` - facilitate grouping of frames. +But the purpose of each is quite different and the groupings they +constitute are independent. + +A ``Request ID`` is essentially a tag. It tells you which logical +request a frame is associated with. + +A *stream* is a sequence of frames grouped for the express purpose +of applying a stateful encoding or for denoting sub-groups of frames. + +Unlike ``Request ID``s which span the request and response, a stream +is unidirectional and stream IDs are independent from client to +server. + +There is no strict hierarchical relationship between ``Request IDs`` +and *streams*. A stream can contain frames having multiple +``Request IDs``. Frames belonging to the same ``Request ID`` can +span multiple streams. + +One goal of streams is to facilitate content encoding. 
A stream can +define an encoding to be applied to frame payloads. For example, the +payload transmitted over the wire may contain output from a +zstandard compression operation and the receiving end may decompress +that payload to obtain the original data. + +The other goal of streams is to facilitate concurrent execution. For +example, a server could spawn 4 threads to service a request that can +be easily parallelized. Each of those 4 threads could write into its +own stream. Those streams could then in turn be delivered to 4 threads +on the receiving end, with each thread consuming its stream in near +isolation. The *main* thread on both ends merely does I/O and +encodes/decodes frame headers: the bulk of the work is done by worker +threads. + +In addition, since content encoding is defined per stream, each +*worker thread* could perform potentially CPU bound work concurrently +with other threads. This approach of applying encoding at the +sub-protocol / stream level eliminates a potential resource constraint +on the protocol stream as a whole (it is common for the throughput of +a compression engine to be smaller than the throughput of a network). + +Having multiple streams - each with their own encoding settings - also +facilitates the use of advanced data compression techniques. For +example, a transmitter could see that it is generating data faster +or slower than the receiving end is consuming it and adjust its +compression settings to trade CPU for compression ratio accordingly. + +While streams can define a content encoding, not all frames within +that stream must use that content encoding. This can be useful when +data is being served from caches and being derived dynamically. A +cache could pre-compress data so the server doesn't have to +recompress it. The ability to pick and choose which frames are +compressed allows servers to easily send data to the wire without +involving potentially expensive encoding overhead.
+ +Content Encoding Profiles +------------------------- + +Streams can have named content encoding *profiles* associated with +them. A profile defines a shared understanding of content encoding +settings and behavior. + +The following profiles are defined: + +TBD + +Command Protocol +---------------- + +A client can request that a remote run a command by sending it +frames defining that command. This logical stream is composed of +1 or more ``Command Request`` frames and 0 or more ``Command Data`` +frames. + +All frames composing a single command request MUST be associated with +the same ``Request ID``. + +Clients MAY send additional command requests without waiting on the +response to a previous command request. If they do so, they MUST ensure +that the ``Request ID`` field of outbound frames does not conflict +with that of an active ``Request ID`` whose response has not yet been +fully received. + +Servers MAY respond to commands in a different order than they were +sent over the wire. Clients MUST be prepared to deal with this. Servers +also MAY start executing commands in a different order than they were +received, or MAY execute multiple commands concurrently. + +If there is a dependency between commands or a race condition between +commands executing (e.g. a read-only command that depends on the results +of a command that mutates the repository), then clients MUST NOT send +frames issuing a command until a response to all dependent commands has +been received. +TODO think about whether we should express dependencies between commands +to avoid roundtrip latency. + +A command is defined by a command name, 0 or more command arguments, +and optional command data. + +Arguments are the recommended mechanism for transferring fixed sets of +parameters to a command. Data is appropriate for transferring variable +data. Thinking in terms of HTTP, arguments would be headers and data +would be the message body.
+ +It is recommended for servers to delay the dispatch of a command +until all arguments have been received. Servers MAY impose limits on the +maximum argument size. +TODO define failure mechanism. + +Servers MAY dispatch to commands immediately once argument data +is available or delay until command data is received in full. + +Once a ``Command Request`` frame is sent, a client must be prepared to +receive any of the following frames associated with that request: +``Command Response``, ``Error Response``, ``Human Output Side-Channel``, +``Progress Update``. + +The *main* response for a command will be in ``Command Response`` frames. +The payloads of these frames consist of 1 or more CBOR encoded values. +The first CBOR value on the first ``Command Response`` frame is special +and denotes the overall status of the command. This CBOR map contains +the following bytestring keys: + +status + (bytestring) A well-defined message containing the overall status of + this command request. The following values are defined: + + ok + The command was received successfully and its response follows. + error + There was an error processing the command. More details about the + error are encoded in the ``error`` key. + +error (optional) + A map containing information about an encountered error. The map has the + following keys: + + message + (array of maps) A message describing the error. The message uses the + same format as those in the ``Human Output Side-Channel`` frame. + Capabilities ============ @@ -374,6 +1210,23 @@ This capability was introduced at the same time as the ``changegroupsubset`` capability/command. +partial-pull +------------ + +Indicates that the client can deal with partial answers to pull requests +by repeating the request. + +If this parameter is not advertised, the server will not send pull bundles. + +This client capability was introduced in Mercurial 4.6.
+ +protocaps +--------- + +Whether the server supports the ``protocaps`` command for SSH V1 transport. + +This capability was introduced in Mercurial 4.6. + pushkey ------- @@ -463,53 +1316,6 @@ reflects the priority/preference of that type, where the first value is the most preferred type. -Handshake Protocol -================== - -While not explicitly required, it is common for clients to perform a -*handshake* when connecting to a server. The handshake accomplishes 2 things: - -* Obtaining capabilities and other server features -* Flushing extra server output (e.g. SSH servers may print extra text - when connecting that may confuse the wire protocol) - -This isn't a traditional *handshake* as far as network protocols go because -there is no persistent state as a result of the handshake: the handshake is -simply the issuing of commands and commands are stateless. - -The canonical clients perform a capabilities lookup at connection establishment -time. This is because clients must assume a server only supports the features -of the original Mercurial server implementation until proven otherwise (from -advertised capabilities). Nearly every server running today supports features -that weren't present in the original Mercurial server implementation. Rather -than wait for a client to perform functionality that needs to consult -capabilities, it issues the lookup at connection start to avoid any delay later. - -For HTTP servers, the client sends a ``capabilities`` command request as -soon as the connection is established. The server responds with a capabilities -string, which the client parses. - -For SSH servers, the client sends the ``hello`` command (no arguments) -and a ``between`` command with the ``pairs`` argument having the value -``0000000000000000000000000000000000000000-0000000000000000000000000000000000000000``. - -The ``between`` command has been supported since the original Mercurial -server. 
Requesting the empty range will return a ``\n`` string response, -which will be encoded as ``1\n\n`` (value length of ``1`` followed by a newline -followed by the value, which happens to be a newline). - -The ``hello`` command was later introduced. Servers supporting it will issue -a response to that command before sending the ``1\n\n`` response to the -``between`` command. Servers not supporting ``hello`` will send an empty -response (``0\n``). - -In addition to the expected output from the ``hello`` and ``between`` commands, -servers may also send other output, such as *message of the day (MOTD)* -announcements. Clients assume servers will send this output before the -Mercurial server replies to the client-issued commands. So any server output -not conforming to the expected command responses is assumed to be not related -to Mercurial and can be ignored. - Content Negotiation =================== @@ -519,8 +1325,8 @@ well-defined response type and only certain commands needed to support functionality like compression. -Currently, only the HTTP transport supports content negotiation at the protocol -layer. +Currently, only the HTTP version 1 transport supports content negotiation +at the protocol layer. HTTP requests advertise supported response formats via the ``X-HgProto-<N>`` request header, where ``<N>`` is an integer starting at 1 allowing the logical @@ -537,6 +1343,12 @@ Indicates the client supports receiving ``application/mercurial-0.2`` responses. +cbor + Indicates the client supports receiving ``application/mercurial-cbor`` + responses. + + (Only intended to be used with version 2 transports.) + comp Indicates compression formats the client can decode. Value is a list of comma delimited strings identifying compression formats ordered from @@ -662,6 +1474,8 @@ This command does not accept any arguments. Return type is a ``string``. +This command was introduced in Mercurial 0.9.1 (released July 2006).
+ changegroup ----------- @@ -737,7 +1551,7 @@ Boolean indicating whether phases data is requested. The return type on success is a ``stream`` where the value is bundle. -On the HTTP transport, the response is zlib compressed. +On the HTTP version 1 transport, the response is zlib compressed. If an error occurs, a generic error response can be sent. @@ -779,6 +1593,8 @@ This command does not accept any arguments. The return type is a ``string``. +This command was introduced in Mercurial 0.9.1 (released July 2006). + listkeys -------- @@ -826,6 +1642,16 @@ There is no trailing newline. +protocaps +--------- + +Notify the server about the client capabilities in the SSH V1 transport +protocol. + +The ``caps`` argument is a space-delimited list of capabilities. + +The server will reply with the string ``OK``. + pushkey ------- @@ -838,13 +1664,14 @@ The return type is a ``string``. The value depends on the transport protocol. -The SSH transport sends a string encoded integer followed by a newline -(``\n``) which indicates operation result. The server may send additional -output on the ``stderr`` stream that should be displayed to the user. +The SSH version 1 transport sends a string encoded integer followed by a +newline (``\n``) which indicates operation result. The server may send +additional output on the ``stderr`` stream that should be displayed to the +user. -The HTTP transport sends a string encoded integer followed by a newline -followed by additional server output that should be displayed to the user. -This may include output from hooks, etc. +The HTTP version 1 transport sends a string encoded integer followed by a +newline followed by additional server output that should be displayed to +the user. This may include output from hooks, etc. The integer result varies by namespace. ``0`` means an error has occurred and there should be additional output to display to the user. @@ -908,18 +1735,179 @@ The encoding of the ``push response`` type varies by transport. 
-For the SSH transport, this type is composed of 2 ``string`` responses: an -empty response (``0\n``) followed by the integer result value. e.g. -``1\n2``. So the full response might be ``0\n1\n2``. +For the SSH version 1 transport, this type is composed of 2 ``string`` +responses: an empty response (``0\n``) followed by the integer result value. +e.g. ``1\n2``. So the full response might be ``0\n1\n2``. -For the HTTP transport, the response is a ``string`` type composed of an -integer result value followed by a newline (``\n``) followed by string +For the HTTP version 1 transport, the response is a ``string`` type composed +of an integer result value followed by a newline (``\n``) followed by string content holding server output that should be displayed on the client (output hooks, etc). In some cases, the server may respond with a ``bundle2`` bundle. In this -case, the response type is ``stream``. For the HTTP transport, the response -is zlib compressed. +case, the response type is ``stream``. For the HTTP version 1 transport, the +response is zlib compressed. The server may also respond with a generic error type, which contains a string indicating the failure. + +Frame-Based Protocol Commands +============================= + +**Experimental and under active development** + +This section documents the wire protocol commands exposed to transports +using the frame-based protocol. The set of commands exposed through +these transports is distinct from the set of commands exposed to legacy +transports. + +The frame-based protocol uses CBOR to encode command execution requests. +All command arguments must be mapped to a specific or set of CBOR data +types. + +The response to many commands is also CBOR. There is no common response +format: each command defines its own response format. + +TODO require node type be specified, as N bytes of binary node value +could be ambiguous once SHA-1 is replaced. + +branchmap +--------- + +Obtain heads in named branches. 
+ +Receives no arguments. + +The response is a map with bytestring keys defining the branch name. +Values are arrays of bytestring defining raw changeset nodes. + +capabilities +------------ + +Obtain the server's capabilities. + +Receives no arguments. + +This command is typically called only as part of the handshake during +initial connection establishment. + +The response is a map with bytestring keys defining server information. + +The defined keys are: + +commands + A map defining available wire protocol commands on this server. + + Keys in the map are the names of commands that can be invoked. Values + are maps defining information about that command. The bytestring keys + are: + + args + A map of argument names and their expected types. + + Types are defined as a representative value for the expected type. + e.g. an argument expecting a boolean type will have its value + set to true. An integer type will have its value set to 42. The + actual values are arbitrary and may not have meaning. + permissions + An array of permissions required to execute this command. + +compression + An array of maps defining available compression format support. + + The array is sorted from most preferred to least preferred. + + Each entry has the following bytestring keys: + + name + Name of the compression engine. e.g. ``zstd`` or ``zlib``. + +framingmediatypes + An array of bytestrings defining the supported framing protocol + media types. Servers will not accept media types not in this list. + +rawrepoformats + An array of storage formats the repository is using. This set of + requirements can be used to determine whether a client can read a + *raw* copy of file data available. + +heads +----- + +Obtain DAG heads in the repository. + +The command accepts the following arguments: + +publiconly (optional) + (boolean) If set, operate on the DAG for public phase changesets only. + Non-public (i.e. draft) phase DAG heads will not be returned. 
+ +The response is a CBOR array of bytestrings defining changeset nodes +of DAG heads. The array can be empty if the repository is empty or no +changesets satisfied the request. + +TODO consider exposing phase of heads in response + +known +----- + +Determine whether a series of changeset nodes is known to the server. + +The command accepts the following arguments: + +nodes + (array of bytestrings) List of changeset nodes whose presence to + query. + +The response is a bytestring where each byte contains a 0 or 1 for the +corresponding requested node at the same index. + +TODO use a bit array for even more compact response + +listkeys +-------- + +List values in a specified ``pushkey`` namespace. + +The command receives the following arguments: + +namespace + (bytestring) Pushkey namespace to query. + +The response is a map with bytestring keys and values. + +TODO consider using binary to represent nodes in certain pushkey namespaces. + +lookup +------ + +Try to resolve a value to a changeset revision. + +Unlike ``known`` which operates on changeset nodes, lookup operates on +node fragments and other names that a user may use. + +The command receives the following arguments: + +key + (bytestring) Value to try to resolve. + +On success, returns a bytestring containing the resolved node. + +pushkey +------- + +Set a value using the ``pushkey`` protocol. + +The command receives the following arguments: + +namespace + (bytestring) Pushkey namespace to operate on. +key + (bytestring) The pushkey key to set. +old + (bytestring) Old value for this key. +new + (bytestring) New value for this key. + +TODO consider using binary to represent nodes in certain pushkey namespaces. +TODO better define response type and meaning.
diff -r fb92df8b634c -r ed5448edcbfa mercurial/hg.py --- a/mercurial/hg.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hg.py Wed Apr 18 15:32:08 2018 -0400 @@ -12,6 +12,7 @@ import hashlib import os import shutil +import stat from .i18n import _ from .node import ( @@ -31,6 +32,7 @@ httppeer, localrepo, lock, + logcmdutil, logexchange, merge as mergemod, node, @@ -46,6 +48,10 @@ vfs as vfsmod, ) +from .utils import ( + stringutil, +) + release = lock.release # shared features @@ -60,9 +66,7 @@ hashbranch, branches = branches if not hashbranch and not branches: x = revs or None - if util.safehasattr(revs, 'first'): - y = revs.first() - elif revs: + if revs: y = revs[0] else: y = None @@ -77,7 +81,9 @@ raise error.Abort(_("remote branch lookup not supported")) revs.append(hashbranch) return revs, revs[0] - branchmap = peer.branchmap() + + with peer.commandexecutor() as e: + branchmap = e.callcommand('branchmap', {}).result() def primary(branch): if branch == '.': @@ -151,9 +157,10 @@ # a list of (ui, repo) functions called for wire peer initialization wirepeersetupfuncs = [] -def _peerorrepo(ui, path, create=False, presetupfuncs=None): +def _peerorrepo(ui, path, create=False, presetupfuncs=None, + intents=None): """return a repository object for the specified path""" - obj = _peerlookup(path).instance(ui, path, create) + obj = _peerlookup(path).instance(ui, path, create, intents=intents) ui = getattr(obj, "ui", ui) for f in presetupfuncs or []: f(ui, obj) @@ -166,19 +173,20 @@ f(ui, obj) return obj -def repository(ui, path='', create=False, presetupfuncs=None): +def repository(ui, path='', create=False, presetupfuncs=None, intents=None): """return a repository object for the specified path""" - peer = _peerorrepo(ui, path, create, presetupfuncs=presetupfuncs) + peer = _peerorrepo(ui, path, create, presetupfuncs=presetupfuncs, + intents=intents) repo = peer.local() if not repo: raise error.Abort(_("repository '%s' is not local") % (path or peer.url())) return 
repo.filtered('visible') -def peer(uiorrepo, opts, path, create=False): +def peer(uiorrepo, opts, path, create=False, intents=None): '''return a repository peer for the specified path''' rui = remoteui(uiorrepo, opts) - return _peerorrepo(rui, path, create).peer() + return _peerorrepo(rui, path, create, intents=intents).peer() def defaultdest(source): '''return default destination of clone if none is given @@ -201,6 +209,24 @@ return '' return os.path.basename(os.path.normpath(path)) +def sharedreposource(repo): + """Returns repository object for source repository of a shared repo. + + If repo is not a shared repository, returns None. + """ + if repo.sharedpath == repo.path: + return None + + if util.safehasattr(repo, 'srcrepo') and repo.srcrepo: + return repo.srcrepo + + # the sharedpath always ends in the .hg; we want the path to the repo + source = repo.vfs.split(repo.sharedpath)[0] + srcurl, branches = parseurl(source) + srcrepo = repository(repo.ui, srcurl) + repo.srcrepo = srcrepo + return srcrepo + def share(ui, source, dest=None, update=True, bookmarks=True, defaultpath=None, relative=False): '''create a shared repository''' @@ -213,7 +239,7 @@ else: dest = ui.expandpath(dest) - if isinstance(source, str): + if isinstance(source, bytes): origsource = ui.expandpath(source) source, branches = parseurl(origsource) srcrepo = repository(ui, source) @@ -250,7 +276,7 @@ # ValueError is raised on Windows if the drive letters differ on # each path raise error.Abort(_('cannot calculate relative path'), - hint=str(e)) + hint=stringutil.forcebytestr(e)) else: requirements += 'shared\n' @@ -399,7 +425,15 @@ raise error.Abort(_("src repository does not support " "revision lookup and so doesn't " "support clone by revision")) - revs = [srcpeer.lookup(r) for r in rev] + + # TODO this is batchable. 
+ remoterevs = [] + for r in rev: + with srcpeer.commandexecutor() as e: + remoterevs.append(e.callcommand('lookup', { + 'key': r, + }).result()) + revs = remoterevs # Obtain a lock before checking for or cloning the pooled repo otherwise # 2 clients may race creating or populating it. @@ -424,7 +458,7 @@ # well. Never update because working copies aren't necessary in # share mode. clone(ui, peeropts, source, dest=sharepath, pull=True, - rev=rev, update=False, stream=stream) + revs=rev, update=False, stream=stream) # Resolve the value to put in [paths] section for the source. if islocal(source): @@ -459,7 +493,7 @@ os.mkdir(dstcachedir) util.copyfile(srcbranchcache, dstbranchcache) -def clone(ui, peeropts, source, dest=None, pull=False, rev=None, +def clone(ui, peeropts, source, dest=None, pull=False, revs=None, update=True, stream=False, branch=None, shareopts=None): """Make a copy of an existing repository. @@ -488,7 +522,7 @@ stream: stream raw data uncompressed from repository (fast over LAN, slow over WAN) - rev: revision to clone up to (implies pull=True) + revs: revision to clone up to (implies pull=True) update: update working directory after clone completes, if destination is local repository (True means update to default rev, @@ -506,13 +540,13 @@ if isinstance(source, bytes): origsource = ui.expandpath(source) - source, branch = parseurl(origsource, branch) + source, branches = parseurl(origsource, branch) srcpeer = peer(ui, peeropts, source) else: srcpeer = source.peer() # in case we were called with a localrepo - branch = (None, branch or []) + branches = (None, branch or []) origsource = source = srcpeer.url() - rev, checkout = addbranchrevs(srcpeer, srcpeer, branch, rev) + revs, checkout = addbranchrevs(srcpeer, srcpeer, branches, revs) if dest is None: dest = defaultdest(source) @@ -545,7 +579,11 @@ # raises RepoLookupError if revision 0 is filtered or otherwise # not available. If we fail to resolve, sharing is not enabled. 
try: - rootnode = srcpeer.lookup('0') + with srcpeer.commandexecutor() as e: + rootnode = e.callcommand('lookup', { + 'key': '0', + }).result() + if rootnode != node.nullid: sharepath = os.path.join(sharepool, node.hex(rootnode)) else: @@ -563,7 +601,7 @@ if sharepath: return clonewithshare(ui, peeropts, sharepath, source, srcpeer, - dest, pull=pull, rev=rev, update=update, + dest, pull=pull, rev=revs, update=update, stream=stream) srclock = destlock = cleandir = None @@ -579,7 +617,7 @@ copy = False if (srcrepo and srcrepo.cancopy() and islocal(dest) and not phases.hassecret(srcrepo)): - copy = not pull and not rev + copy = not pull and not revs if copy: try: @@ -636,14 +674,24 @@ % dest) raise - revs = None - if rev: + if revs: if not srcpeer.capable('lookup'): raise error.Abort(_("src repository does not support " "revision lookup and so doesn't " "support clone by revision")) - revs = [srcpeer.lookup(r) for r in rev] + + # TODO this is batchable. + remoterevs = [] + for rev in revs: + with srcpeer.commandexecutor() as e: + remoterevs.append(e.callcommand('lookup', { + 'key': rev, + }).result()) + revs = remoterevs + checkout = revs[0] + else: + revs = None local = destpeer.local() if local: u = util.url(abspath) @@ -682,13 +730,17 @@ if update: if update is not True: - checkout = srcpeer.lookup(update) + with srcpeer.commandexecutor() as e: + checkout = e.callcommand('lookup', { + 'key': update, + }).result() + uprev = None status = None if checkout is not None: - try: - uprev = destrepo.lookup(checkout) - except error.RepoLookupError: + if checkout in destrepo: + uprev = checkout + else: if update is not True: try: uprev = destrepo.lookup(update) @@ -725,10 +777,12 @@ return srcpeer, destpeer def _showstats(repo, stats, quietempty=False): - if quietempty and not any(stats): + if quietempty and stats.isempty(): return repo.ui.status(_("%d files updated, %d files merged, " - "%d files removed, %d files unresolved\n") % stats) + "%d files removed, %d files 
unresolved\n") % ( + stats.updatedcount, stats.mergedcount, + stats.removedcount, stats.unresolvedcount)) def updaterepo(repo, node, overwrite, updatecheck=None): """Update the working directory to node. @@ -744,9 +798,9 @@ """update the working directory to node""" stats = updaterepo(repo, node, False, updatecheck=updatecheck) _showstats(repo, stats, quietempty) - if stats[3]: + if stats.unresolvedcount: repo.ui.status(_("use 'hg resolve' to retry unresolved file merges\n")) - return stats[3] > 0 + return stats.unresolvedcount > 0 # naming conflict in clone() _update = update @@ -757,7 +811,7 @@ repo.vfs.unlinkpath('graftstate', ignoremissing=True) if show_stats: _showstats(repo, stats, quietempty) - return stats[3] > 0 + return stats.unresolvedcount > 0 # naming conflict in updatetotally() _clean = clean @@ -856,12 +910,12 @@ labels=labels) _showstats(repo, stats) - if stats[3]: + if stats.unresolvedcount: repo.ui.status(_("use 'hg resolve' to retry unresolved file merges " "or 'hg merge --abort' to abandon\n")) elif remind and not abort: repo.ui.status(_("(branch merge, don't forget to commit)\n")) - return stats[3] > 0 + return stats.unresolvedcount > 0 def _incoming(displaychlist, subreporecurse, ui, repo, source, opts, buffered=False): @@ -885,7 +939,8 @@ ui.status(_("no changes found\n")) return subreporecurse() ui.pager('incoming') - displayer = cmdutil.show_changeset(ui, other, opts, buffered) + displayer = logcmdutil.changesetdisplayer(ui, other, opts, + buffered=buffered) displaychlist(other, chlist, displayer) displayer.close() finally: @@ -904,7 +959,7 @@ return ret def display(other, chlist, displayer): - limit = cmdutil.loglimit(opts) + limit = logcmdutil.getlimit(opts) if opts.get('newest_first'): chlist.reverse() count = 0 @@ -929,7 +984,7 @@ ui.status(_('comparing with %s\n') % util.hidepassword(dest)) revs, checkout = addbranchrevs(repo, repo, branches, opts.get('rev')) if revs: - revs = [repo.lookup(rev) for rev in scmutil.revrange(repo, revs)] 
+ revs = [repo[rev].node() for rev in scmutil.revrange(repo, revs)] other = peer(repo, opts, dest) outgoing = discovery.findcommonoutgoing(repo, other, revs, @@ -949,7 +1004,7 @@ ret = min(ret, sub.outgoing(ui, dest, opts)) return ret - limit = cmdutil.loglimit(opts) + limit = logcmdutil.getlimit(opts) o, other = _outgoing(ui, repo, dest, opts) if not o: cmdutil.outgoinghooks(ui, repo, other, opts, o) @@ -958,7 +1013,7 @@ if opts.get('newest_first'): o.reverse() ui.pager('outgoing') - displayer = cmdutil.show_changeset(ui, repo, opts) + displayer = logcmdutil.changesetdisplayer(ui, repo, opts) count = 0 for n in o: if limit is not None and count >= limit: @@ -995,7 +1050,7 @@ ret = (ctx.sub(subpath, allowcreate=False).verify() or ret) except error.RepoError as e: - repo.ui.warn(('%s: %s\n') % (rev, e)) + repo.ui.warn(('%d: %s\n') % (rev, e)) except Exception: repo.ui.warn(_('.hgsubstate is corrupt in revision %s\n') % node.short(ctx.node())) @@ -1093,8 +1148,8 @@ st = os.stat(p) except OSError: st = os.stat(prefix) - state.append((st.st_mtime, st.st_size)) - maxmtime = max(maxmtime, st.st_mtime) + state.append((st[stat.ST_MTIME], st.st_size)) + maxmtime = max(maxmtime, st[stat.ST_MTIME]) return tuple(state), maxmtime diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/__init__.py --- a/mercurial/hgweb/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/__init__.py Wed Apr 18 15:32:08 2018 -0400 @@ -15,7 +15,10 @@ from .. import ( error, pycompat, - util, +) + +from ..utils import ( + procutil, ) from . 
import ( @@ -35,7 +38,7 @@ - list of virtual:real tuples (multi-repo view) ''' - if ((isinstance(config, str) and not os.path.isdir(config)) or + if ((isinstance(config, bytes) and not os.path.isdir(config)) or isinstance(config, dict) or isinstance(config, list)): # create a multi-dir interface return hgwebdir_mod.hgwebdir(config, baseui=baseui) @@ -51,7 +54,7 @@ self.opts = opts def init(self): - util.setsignalhandler() + procutil.setsignalhandler() self.httpd = server.create_server(self.ui, self.app) if self.opts['port'] and not self.ui.verbose: diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/common.py --- a/mercurial/hgweb/common.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/common.py Wed Apr 18 15:32:08 2018 -0400 @@ -12,6 +12,7 @@ import errno import mimetypes import os +import stat from .. import ( encoding, @@ -22,12 +23,15 @@ httpserver = util.httpserver HTTP_OK = 200 +HTTP_CREATED = 201 HTTP_NOT_MODIFIED = 304 HTTP_BAD_REQUEST = 400 HTTP_UNAUTHORIZED = 401 HTTP_FORBIDDEN = 403 HTTP_NOT_FOUND = 404 HTTP_METHOD_NOT_ALLOWED = 405 +HTTP_NOT_ACCEPTABLE = 406 +HTTP_UNSUPPORTED_MEDIA_TYPE = 415 HTTP_SERVER_ERROR = 500 @@ -45,7 +49,7 @@ authentication info). 
Return if op allowed, else raise an ErrorResponse exception.''' - user = req.env.get('REMOTE_USER') + user = req.remoteuser deny_read = hgweb.configlist('web', 'deny_read') if deny_read and (not user or ismember(hgweb.repo.ui, user, deny_read)): @@ -60,15 +64,19 @@ elif op == 'pull' or op is None: # op is None for interface requests return + # Allow LFS uploading via PUT requests + if op == 'upload': + if req.method != 'PUT': + msg = 'upload requires PUT request' + raise ErrorResponse(HTTP_METHOD_NOT_ALLOWED, msg) # enforce that you can only push using POST requests - if req.env['REQUEST_METHOD'] != 'POST': + elif req.method != 'POST': msg = 'push requires POST request' raise ErrorResponse(HTTP_METHOD_NOT_ALLOWED, msg) # require ssl by default for pushing, auth info cannot be sniffed # and replayed - scheme = req.env.get('wsgi.url_scheme') - if hgweb.configbool('web', 'push_ssl') and scheme != 'https': + if hgweb.configbool('web', 'push_ssl') and req.urlscheme != 'https': raise ErrorResponse(HTTP_FORBIDDEN, 'ssl required') deny = hgweb.configlist('web', 'deny_push') @@ -81,7 +89,7 @@ # Hooks for hgweb permission checks; extensions can add hooks here. # Each hook is invoked like this: hook(hgweb, request, operation), -# where operation is either read, pull or push. Hooks should either +# where operation is either read, pull, push or upload. Hooks should either # raise an ErrorResponse exception, or just return. # # It is possible to do both authentication and authorization through @@ -93,13 +101,20 @@ def __init__(self, code, message=None, headers=None): if message is None: message = _statusmessage(code) - Exception.__init__(self, message) + Exception.__init__(self, pycompat.sysstr(message)) self.code = code if headers is None: headers = [] self.headers = headers class continuereader(object): + """File object wrapper to handle HTTP 100-continue. + + This is used by servers so they automatically handle Expect: 100-continue + request headers. 
On first read of the request body, the 100 Continue + response is sent. This should trigger the client into actually sending + the request body. + """ def __init__(self, f, write): self.f = f self._write = write @@ -118,7 +133,8 @@ def _statusmessage(code): responses = httpserver.basehttprequesthandler.responses - return responses.get(code, ('Error', 'Unknown error'))[0] + return pycompat.bytesurl( + responses.get(code, (r'Error', r'Unknown error'))[0]) def statusmessage(code, message=None): return '%d %s' % (code, message or _statusmessage(code)) @@ -132,20 +148,20 @@ return os.stat(spath) def get_mtime(spath): - return get_stat(spath, "00changelog.i").st_mtime + return get_stat(spath, "00changelog.i")[stat.ST_MTIME] def ispathsafe(path): """Determine if a path is safe to use for filesystem access.""" parts = path.split('/') for part in parts: - if (part in ('', os.curdir, os.pardir) or + if (part in ('', pycompat.oscurdir, pycompat.ospardir) or pycompat.ossep in part or pycompat.osaltsep is not None and pycompat.osaltsep in part): return False return True -def staticfile(directory, fname, req): +def staticfile(directory, fname, res): """return a file inside directory with guessed Content-Type header fname always uses '/' as directory separator and isn't allowed to @@ -170,7 +186,9 @@ with open(path, 'rb') as fh: data = fh.read() - req.respond(HTTP_OK, ct, body=data) + res.headers['Content-Type'] = ct + res.setbodybytes(data) + return res except TypeError: raise ErrorResponse(HTTP_SERVER_ERROR, 'illegal filename') except OSError as err: @@ -185,7 +203,7 @@ if stripecount and offset: # account for offset, e.g. 
due to building the list in reverse count = (stripecount + offset) % stripecount - parity = (stripecount + offset) / stripecount & 1 + parity = (stripecount + offset) // stripecount & 1 else: count = 0 parity = 0 @@ -206,12 +224,6 @@ config("ui", "username") or encoding.environ.get("EMAIL") or "") -def caching(web, req): - tag = r'W/"%d"' % web.mtime - if req.env.get('HTTP_IF_NONE_MATCH') == tag: - raise ErrorResponse(HTTP_NOT_MODIFIED) - req.headers.append(('ETag', tag)) - def cspvalues(ui): """Obtain the Content-Security-Policy header and nonce value. diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/hgweb_mod.py --- a/mercurial/hgweb/hgweb_mod.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/hgweb_mod.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,53 +14,39 @@ from .common import ( ErrorResponse, HTTP_BAD_REQUEST, - HTTP_NOT_FOUND, - HTTP_NOT_MODIFIED, - HTTP_OK, - HTTP_SERVER_ERROR, - caching, cspvalues, permhooks, + statusmessage, ) -from .request import wsgirequest from .. import ( encoding, error, + formatter, hg, hook, profiling, pycompat, + registrar, repoview, templatefilters, templater, + templateutil, ui as uimod, util, - wireproto, + wireprotoserver, ) from . import ( - protocol, + request as requestmod, webcommands, webutil, wsgicgi, ) -# Aliased for API compatibility. 
-perms = wireproto.permissions - -archivespecs = util.sortdict(( - ('zip', ('application/zip', 'zip', '.zip', None)), - ('gz', ('application/x-gzip', 'tgz', '.tar.gz', None)), - ('bz2', ('application/x-bzip2', 'tbz2', '.tar.bz2', None)), -)) - def getstyle(req, configfn, templatepath): - fromreq = req.form.get('style', [None])[0] - if fromreq is not None: - fromreq = pycompat.sysbytes(fromreq) styles = ( - fromreq, + req.qsparams.get('style', None), configfn('web', 'style'), 'paper', ) @@ -89,7 +75,7 @@ break breadcrumb.append({'url': urlel, 'name': pathel}) urlel = os.path.dirname(urlel) - return reversed(breadcrumb) + return templateutil.mappinglist(reversed(breadcrumb)) class requestcontext(object): """Holds state/context for an individual request. @@ -98,11 +84,11 @@ is prone to race conditions. Instances of this class exist to hold mutable and race-free state for requests. """ - def __init__(self, app, repo): + def __init__(self, app, repo, req, res): self.repo = repo self.reponame = app.reponame - - self.archivespecs = archivespecs + self.req = req + self.res = res self.maxchanges = self.configint('web', 'maxchanges') self.stripecount = self.configint('web', 'stripes') @@ -142,29 +128,15 @@ untrusted=untrusted) def archivelist(self, nodeid): - allowed = self.configlist('web', 'allow_archive') - for typ, spec in self.archivespecs.iteritems(): - if typ in allowed or self.configbool('web', 'allow%s' % typ): - yield {'type': typ, 'extension': spec[2], 'node': nodeid} + return webutil.archivelist(self.repo.ui, nodeid) def templater(self, req): # determine scheme, port and server name # this is needed to create absolute urls - - proto = req.env.get('wsgi.url_scheme') - if proto == 'https': - proto = 'https' - default_port = '443' - else: - proto = 'http' - default_port = '80' - - port = req.env[r'SERVER_PORT'] - port = port != default_port and (r':' + port) or r'' - urlbase = r'%s://%s%s' % (proto, req.env[r'SERVER_NAME'], port) logourl = self.config('web', 
'logourl') logoimg = self.config('web', 'logoimg') - staticurl = self.config('web', 'staticurl') or req.url + 'static/' + staticurl = (self.config('web', 'staticurl') + or req.apppath + '/static/') if not staticurl.endswith('/'): staticurl += '/' @@ -181,38 +153,48 @@ if style == styles[0]: vars['style'] = style - start = '&' if req.url[-1] == r'?' else '?' - sessionvars = webutil.sessionvars(vars, start) + sessionvars = webutil.sessionvars(vars, '?') if not self.reponame: self.reponame = (self.config('web', 'name', '') - or req.env.get('REPO_NAME') - or req.url.strip('/') or self.repo.root) + or req.reponame + or req.apppath + or self.repo.root) + filters = {} + templatefilter = registrar.templatefilter(filters) + @templatefilter('websub', intype=bytes) def websubfilter(text): return templatefilters.websub(text, self.websubtable) # create the templater - + # TODO: export all keywords: defaults = templatekw.keywords.copy() defaults = { - 'url': req.url, + 'url': req.apppath + '/', 'logourl': logourl, 'logoimg': logoimg, 'staticurl': staticurl, - 'urlbase': urlbase, + 'urlbase': req.advertisedbaseurl, 'repo': self.reponame, 'encoding': encoding.encoding, 'motd': motd, 'sessionvars': sessionvars, - 'pathdef': makebreadcrumb(req.url), + 'pathdef': makebreadcrumb(req.apppath), 'style': style, 'nonce': self.nonce, } + tres = formatter.templateresources(self.repo.ui, self.repo) tmpl = templater.templater.frommapfile(mapfile, - filters={'websub': websubfilter}, - defaults=defaults) + filters=filters, + defaults=defaults, + resources=tres) return tmpl + def sendtemplate(self, name, **kwargs): + """Helper function to send a response generated from a template.""" + kwargs = pycompat.byteskwargs(kwargs) + self.res.setbodygen(self.tmpl.generate(name, kwargs)) + return self.res.sendresponse() class hgweb(object): """HTTP server for individual repositories. @@ -226,7 +208,7 @@ be multiple active threads inside __call__. 
""" def __init__(self, repo, name=None, baseui=None): - if isinstance(repo, str): + if isinstance(repo, bytes): if baseui: u = baseui.copy() else: @@ -303,10 +285,12 @@ This may be called by multiple threads. """ - req = wsgirequest(env, respond) - return self.run_wsgi(req) + req = requestmod.parserequestfromenv(env) + res = requestmod.wsgiresponse(req, respond) - def run_wsgi(self, req): + return self.run_wsgi(req, res) + + def run_wsgi(self, req, res): """Internal method to run the WSGI application. This is typically only called by Mercurial. External consumers @@ -315,155 +299,139 @@ with self._obtainrepo() as repo: profile = repo.ui.configbool('profiling', 'enabled') with profiling.profile(repo.ui, enabled=profile): - for r in self._runwsgi(req, repo): + for r in self._runwsgi(req, res, repo): yield r - def _runwsgi(self, req, repo): - rctx = requestcontext(self, repo) + def _runwsgi(self, req, res, repo): + rctx = requestcontext(self, repo, req, res) # This state is global across all threads. encoding.encoding = rctx.config('web', 'encoding') - rctx.repo.ui.environ = req.env + rctx.repo.ui.environ = req.rawenv if rctx.csp: # hgwebdir may have added CSP header. Since we generate our own, # replace it. 
- req.headers = [h for h in req.headers - if h[0] != 'Content-Security-Policy'] - req.headers.append(('Content-Security-Policy', rctx.csp)) - - # work with CGI variables to create coherent structure - # use SCRIPT_NAME, PATH_INFO and QUERY_STRING as well as our REPO_NAME - - req.url = req.env[r'SCRIPT_NAME'] - if not req.url.endswith('/'): - req.url += '/' - if req.env.get('REPO_NAME'): - req.url += req.env[r'REPO_NAME'] + r'/' + res.headers['Content-Security-Policy'] = rctx.csp - if r'PATH_INFO' in req.env: - parts = req.env[r'PATH_INFO'].strip('/').split('/') - repo_parts = req.env.get(r'REPO_NAME', r'').split(r'/') - if parts[:len(repo_parts)] == repo_parts: - parts = parts[len(repo_parts):] - query = '/'.join(parts) - else: - query = req.env[r'QUERY_STRING'].partition(r'&')[0] - query = query.partition(r';')[0] - - # process this if it's a protocol request - # protocol bits don't need to create any URLs - # and the clients always use the old URL structure + # /api/* is reserved for various API implementations. Dispatch + # accordingly. But URL paths can conflict with subrepos and virtual + # repos in hgwebdir. So until we have a workaround for this, only + # expose the URLs if the feature is enabled. + apienabled = rctx.repo.ui.configbool('experimental', 'web.apiserver') + if apienabled and req.dispatchparts and req.dispatchparts[0] == b'api': + wireprotoserver.handlewsgiapirequest(rctx, req, res, + self.check_perm) + return res.sendresponse() - cmd = pycompat.sysbytes(req.form.get(r'cmd', [r''])[0]) - if protocol.iscmd(cmd): - try: - if query: - raise ErrorResponse(HTTP_NOT_FOUND) + handled = wireprotoserver.handlewsgirequest( + rctx, req, res, self.check_perm) + if handled: + return res.sendresponse() - req.checkperm = lambda op: self.check_perm(rctx, req, op) - # Assume commands with no defined permissions are writes / - # for pushes. 
This is the safest from a security perspective - # because it doesn't allow commands with undefined semantics - # from bypassing permissions checks. - req.checkperm(perms.get(cmd, 'push')) - return protocol.call(rctx.repo, req, cmd) - except ErrorResponse as inst: - # A client that sends unbundle without 100-continue will - # break if we respond early. - if (cmd == 'unbundle' and - (req.env.get('HTTP_EXPECT', - '').lower() != '100-continue') or - req.env.get('X-HgHttp2', '')): - req.drain() - else: - req.headers.append((r'Connection', r'Close')) - req.respond(inst, protocol.HGTYPE, - body='0\n%s\n' % inst) - return '' + # Old implementations of hgweb supported dispatching the request via + # the initial query string parameter instead of using PATH_INFO. + # If PATH_INFO is present (signaled by ``req.dispatchpath`` having + # a value), we use it. Otherwise fall back to the query string. + if req.dispatchpath is not None: + query = req.dispatchpath + else: + query = req.querystring.partition('&')[0].partition(';')[0] # translate user-visible url structure to internal structure args = query.split('/', 2) - if r'cmd' not in req.form and args and args[0]: + if 'cmd' not in req.qsparams and args and args[0]: cmd = args.pop(0) style = cmd.rfind('-') if style != -1: - req.form['style'] = [cmd[:style]] + req.qsparams['style'] = cmd[:style] cmd = cmd[style + 1:] # avoid accepting e.g. 
style parameter as command if util.safehasattr(webcommands, cmd): - req.form[r'cmd'] = [cmd] + req.qsparams['cmd'] = cmd if cmd == 'static': - req.form['file'] = ['/'.join(args)] + req.qsparams['file'] = '/'.join(args) else: if args and args[0]: node = args.pop(0).replace('%2F', '/') - req.form['node'] = [node] + req.qsparams['node'] = node if args: - req.form['file'] = args + if 'file' in req.qsparams: + del req.qsparams['file'] + for a in args: + req.qsparams.add('file', a) - ua = req.env.get('HTTP_USER_AGENT', '') + ua = req.headers.get('User-Agent', '') if cmd == 'rev' and 'mercurial' in ua: - req.form['style'] = ['raw'] + req.qsparams['style'] = 'raw' if cmd == 'archive': - fn = req.form['node'][0] - for type_, spec in rctx.archivespecs.iteritems(): + fn = req.qsparams['node'] + for type_, spec in webutil.archivespecs.iteritems(): ext = spec[2] if fn.endswith(ext): - req.form['node'] = [fn[:-len(ext)]] - req.form['type'] = [type_] + req.qsparams['node'] = fn[:-len(ext)] + req.qsparams['type'] = type_ + else: + cmd = req.qsparams.get('cmd', '') # process the web interface request try: - tmpl = rctx.templater(req) - ctype = tmpl('mimetype', encoding=encoding.encoding) - ctype = templater.stringify(ctype) + rctx.tmpl = rctx.templater(req) + ctype = rctx.tmpl.render('mimetype', + {'encoding': encoding.encoding}) # check read permissions non-static content if cmd != 'static': self.check_perm(rctx, req, None) if cmd == '': - req.form[r'cmd'] = [tmpl.cache['default']] - cmd = req.form[r'cmd'][0] + req.qsparams['cmd'] = rctx.tmpl.render('default', {}) + cmd = req.qsparams['cmd'] # Don't enable caching if using a CSP nonce because then it wouldn't # be a nonce. if rctx.configbool('web', 'cache') and not rctx.nonce: - caching(self, req) # sets ETag header or raises NOT_MODIFIED + tag = 'W/"%d"' % self.mtime + if req.headers.get('If-None-Match') == tag: + res.status = '304 Not Modified' + # Response body not allowed on 304. 
+ res.setbodybytes('') + return res.sendresponse() + + res.headers['ETag'] = tag + if cmd not in webcommands.__all__: msg = 'no such method: %s' % cmd raise ErrorResponse(HTTP_BAD_REQUEST, msg) - elif cmd == 'file' and r'raw' in req.form.get(r'style', []): - rctx.ctype = ctype - content = webcommands.rawfile(rctx, req, tmpl) else: - content = getattr(webcommands, cmd)(rctx, req, tmpl) - req.respond(HTTP_OK, ctype) - - return content + # Set some globals appropriate for web handlers. Commands can + # override easily enough. + res.status = '200 Script output follows' + res.headers['Content-Type'] = ctype + return getattr(webcommands, cmd)(rctx) except (error.LookupError, error.RepoLookupError) as err: - req.respond(HTTP_NOT_FOUND, ctype) - msg = str(err) + msg = pycompat.bytestr(err) if (util.safehasattr(err, 'name') and not isinstance(err, error.ManifestLookupError)): msg = 'revision not found: %s' % err.name - return tmpl('error', error=msg) - except (error.RepoError, error.RevlogError) as inst: - req.respond(HTTP_SERVER_ERROR, ctype) - return tmpl('error', error=str(inst)) - except ErrorResponse as inst: - req.respond(inst, ctype) - if inst.code == HTTP_NOT_MODIFIED: - # Not allowed to return a body on a 304 - return [''] - return tmpl('error', error=str(inst)) + + res.status = '404 Not Found' + res.headers['Content-Type'] = ctype + return rctx.sendtemplate('error', error=msg) + except (error.RepoError, error.RevlogError) as e: + res.status = '500 Internal Server Error' + res.headers['Content-Type'] = ctype + return rctx.sendtemplate('error', error=pycompat.bytestr(e)) + except ErrorResponse as e: + res.status = statusmessage(e.code, pycompat.bytestr(e)) + res.headers['Content-Type'] = ctype + return rctx.sendtemplate('error', error=pycompat.bytestr(e)) def check_perm(self, rctx, req, op): for permhook in permhooks: diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/hgwebdir_mod.py --- a/mercurial/hgweb/hgwebdir_mod.py Wed Apr 04 10:35:09 2018 -0400 +++ 
b/mercurial/hgweb/hgwebdir_mod.py Wed Apr 18 15:32:08 2018 -0400 @@ -10,15 +10,12 @@ import gc import os -import re import time from ..i18n import _ from .common import ( ErrorResponse, - HTTP_NOT_FOUND, - HTTP_OK, HTTP_SERVER_ERROR, cspvalues, get_contact, @@ -26,8 +23,8 @@ ismember, paritygen, staticfile, + statusmessage, ) -from .request import wsgirequest from .. import ( configitems, @@ -38,15 +35,18 @@ pycompat, scmutil, templater, + templateutil, ui as uimod, util, ) from . import ( hgweb_mod, + request as requestmod, webutil, wsgicgi, ) +from ..utils import dateutil def cleannames(items): return [(util.pconvert(name).strip('/'), path) for name, path in items] @@ -83,32 +83,174 @@ yield (prefix + '/' + util.pconvert(path[len(roothead):]).lstrip('/')).strip('/'), path -def geturlcgivars(baseurl, port): - """ - Extract CGI variables from baseurl +def readallowed(ui, req): + """Check allow_read and deny_read config options of a repo's ui object + to determine user permissions. By default, with neither option set (or + both empty), allow all users to read the repo. There are two ways a + user can be denied read access: (1) deny_read is not empty, and the + user is unauthenticated or deny_read contains user (or *), and (2) + allow_read is not empty and the user is not in allow_read. 
Return True + if user is allowed to read the repo, else return False.""" + + user = req.remoteuser + + deny_read = ui.configlist('web', 'deny_read', untrusted=True) + if deny_read and (not user or ismember(ui, user, deny_read)): + return False + + allow_read = ui.configlist('web', 'allow_read', untrusted=True) + # by default, allow reading if no allow_read option has been set + if not allow_read or ismember(ui, user, allow_read): + return True + + return False + +def rawindexentries(ui, repos, req, subdir=''): + descend = ui.configbool('web', 'descend') + collapse = ui.configbool('web', 'collapse') + seenrepos = set() + seendirs = set() + for name, path in repos: + + if not name.startswith(subdir): + continue + name = name[len(subdir):] + directory = False + + if '/' in name: + if not descend: + continue + + nameparts = name.split('/') + rootname = nameparts[0] + + if not collapse: + pass + elif rootname in seendirs: + continue + elif rootname in seenrepos: + pass + else: + directory = True + name = rootname + + # redefine the path to refer to the directory + discarded = '/'.join(nameparts[1:]) + + # remove name parts plus accompanying slash + path = path[:-len(discarded) - 1] + + try: + r = hg.repository(ui, path) + directory = False + except (IOError, error.RepoError): + pass + + parts = [ + req.apppath.strip('/'), + subdir.strip('/'), + name.strip('/'), + ] + url = '/' + '/'.join(p for p in parts if p) + '/' + + # show either a directory entry or a repository + if directory: + # get the directory's time information + try: + d = (get_mtime(path), dateutil.makedate()[1]) + except OSError: + continue - >>> geturlcgivars(b"http://host.org/base", b"80") - ('host.org', '80', '/base') - >>> geturlcgivars(b"http://host.org:8000/base", b"80") - ('host.org', '8000', '/base') - >>> geturlcgivars(b'/base', 8000) - ('', '8000', '/base') - >>> geturlcgivars(b"base", b'8000') - ('', '8000', '/base') - >>> geturlcgivars(b"http://host", b'8000') - ('host', '8000', '/') - >>> 
geturlcgivars(b"http://host/", b'8000') - ('host', '8000', '/') - """ - u = util.url(baseurl) - name = u.host or '' - if u.port: - port = u.port - path = u.path or "" - if not path.startswith('/'): - path = '/' + path + # add '/' to the name to make it obvious that + # the entry is a directory, not a regular repository + row = {'contact': "", + 'contact_sort': "", + 'name': name + '/', + 'name_sort': name, + 'url': url, + 'description': "", + 'description_sort': "", + 'lastchange': d, + 'lastchange_sort': d[1] - d[0], + 'archives': templateutil.mappinglist([]), + 'isdirectory': True, + 'labels': templateutil.hybridlist([], name='label'), + } + + seendirs.add(name) + yield row + continue + + u = ui.copy() + try: + u.readconfig(os.path.join(path, '.hg', 'hgrc')) + except Exception as e: + u.warn(_('error reading %s/.hg/hgrc: %s\n') % (path, e)) + continue + + def get(section, name, default=uimod._unset): + return u.config(section, name, default, untrusted=True) + + if u.configbool("web", "hidden", untrusted=True): + continue + + if not readallowed(u, req): + continue - return name, pycompat.bytestr(port), path + # update time with local timezone + try: + r = hg.repository(ui, path) + except IOError: + u.warn(_('error accessing repository at %s\n') % path) + continue + except error.RepoError: + u.warn(_('error accessing repository at %s\n') % path) + continue + try: + d = (get_mtime(r.spath), dateutil.makedate()[1]) + except OSError: + continue + + contact = get_contact(get) + description = get("web", "description") + seenrepos.add(name) + name = get("web", "name", name) + labels = u.configlist('web', 'labels', untrusted=True) + row = {'contact': contact or "unknown", + 'contact_sort': contact.upper() or "unknown", + 'name': name, + 'name_sort': name, + 'url': url, + 'description': description or "unknown", + 'description_sort': description.upper() or "unknown", + 'lastchange': d, + 'lastchange_sort': d[1] - d[0], + 'archives': webutil.archivelist(u, "tip", url), + 
'isdirectory': None, + 'labels': templateutil.hybridlist(labels, name='label'), + } + + yield row + +def _indexentriesgen(context, ui, repos, req, stripecount, sortcolumn, + descending, subdir): + rows = rawindexentries(ui, repos, req, subdir=subdir) + + sortdefault = None, False + + if sortcolumn and sortdefault != (sortcolumn, descending): + sortkey = '%s_sort' % sortcolumn + rows = sorted(rows, key=lambda x: x[sortkey], + reverse=descending) + + for row, parity in zip(rows, paritygen(stripecount)): + row['parity'] = parity + yield row + +def indexentries(ui, repos, req, stripecount, sortcolumn='', + descending=False, subdir=''): + args = (ui, repos, req, stripecount, sortcolumn, descending, subdir) + return templateutil.mappinggenerator(_indexentriesgen, args=args) class hgwebdir(object): """HTTP server for multiple repositories. @@ -180,7 +322,6 @@ self.stripecount = self.ui.config('web', 'stripes') if self.stripecount: self.stripecount = int(self.stripecount) - self._baseurl = self.ui.config('web', 'baseurl') prefix = self.ui.config('web', 'prefix') if prefix.startswith('/'): prefix = prefix[1:] @@ -197,36 +338,17 @@ wsgicgi.launch(self) def __call__(self, env, respond): - req = wsgirequest(env, respond) - return self.run_wsgi(req) - - def read_allowed(self, ui, req): - """Check allow_read and deny_read config options of a repo's ui object - to determine user permissions. By default, with neither option set (or - both empty), allow all users to read the repo. There are two ways a - user can be denied read access: (1) deny_read is not empty, and the - user is unauthenticated or deny_read contains user (or *), and (2) - allow_read is not empty and the user is not in allow_read. 
Return True - if user is allowed to read the repo, else return False.""" + baseurl = self.ui.config('web', 'baseurl') + req = requestmod.parserequestfromenv(env, altbaseurl=baseurl) + res = requestmod.wsgiresponse(req, respond) - user = req.env.get('REMOTE_USER') - - deny_read = ui.configlist('web', 'deny_read', untrusted=True) - if deny_read and (not user or ismember(ui, user, deny_read)): - return False + return self.run_wsgi(req, res) - allow_read = ui.configlist('web', 'allow_read', untrusted=True) - # by default, allow reading if no allow_read option has been set - if (not allow_read) or ismember(ui, user, allow_read): - return True - - return False - - def run_wsgi(self, req): + def run_wsgi(self, req, res): profile = self.ui.configbool('profiling', 'enabled') with profiling.profile(self.ui, enabled=profile): try: - for r in self._runwsgi(req): + for r in self._runwsgi(req, res): yield r finally: # There are known cycles in localrepository that prevent @@ -238,25 +360,28 @@ # instances instead of every request. gc.collect() - def _runwsgi(self, req): + def _runwsgi(self, req, res): try: self.refresh() csp, nonce = cspvalues(self.ui) if csp: - req.headers.append(('Content-Security-Policy', csp)) + res.headers['Content-Security-Policy'] = csp - virtual = req.env.get("PATH_INFO", "").strip('/') + virtual = req.dispatchpath.strip('/') tmpl = self.templater(req, nonce) - ctype = tmpl('mimetype', encoding=encoding.encoding) - ctype = templater.stringify(ctype) + ctype = tmpl.render('mimetype', {'encoding': encoding.encoding}) + + # Global defaults. These can be overridden by any handler. 
+ res.status = '200 Script output follows' + res.headers['Content-Type'] = ctype # a static file - if virtual.startswith('static/') or 'static' in req.form: + if virtual.startswith('static/') or 'static' in req.qsparams: if virtual.startswith('static/'): fname = virtual[7:] else: - fname = req.form['static'][0] + fname = req.qsparams['static'] static = self.ui.config("web", "static", None, untrusted=False) if not static: @@ -264,24 +389,23 @@ if isinstance(tp, str): tp = [tp] static = [os.path.join(p, 'static') for p in tp] - staticfile(static, fname, req) - return [] + + staticfile(static, fname, res) + return res.sendresponse() # top-level index repos = dict(self.repos) if (not virtual or virtual == 'index') and virtual not in repos: - req.respond(HTTP_OK, ctype) - return self.makeindex(req, tmpl) + return self.makeindex(req, res, tmpl) # nested indexes and hgwebs if virtual.endswith('/index') and virtual not in repos: subdir = virtual[:-len('index')] if any(r.startswith(subdir) for r in repos): - req.respond(HTTP_OK, ctype) - return self.makeindex(req, tmpl, subdir) + return self.makeindex(req, res, tmpl, subdir) def _virtualdirs(): # Check the full virtual path, each parent, and the root ('') @@ -296,11 +420,19 @@ for virtualrepo in _virtualdirs(): real = repos.get(virtualrepo) if real: - req.env['REPO_NAME'] = virtualrepo + # Re-parse the WSGI environment to take into account our + # repository path component. 
+ uenv = req.rawenv + if pycompat.ispy3: + uenv = {k.decode('latin1'): v for k, v in + uenv.iteritems()} + req = requestmod.parserequestfromenv( + uenv, reponame=virtualrepo, + altbaseurl=self.ui.config('web', 'baseurl')) try: # ensure caller gets private copy of ui repo = hg.repository(self.ui.copy(), real) - return hgweb_mod.hgweb(repo).run_wsgi(req) + return hgweb_mod.hgweb(repo).run_wsgi(req, res) except IOError as inst: msg = encoding.strtolocal(inst.strerror) raise ErrorResponse(HTTP_SERVER_ERROR, msg) @@ -310,173 +442,26 @@ # browse subdirectories subdir = virtual + '/' if [r for r in repos if r.startswith(subdir)]: - req.respond(HTTP_OK, ctype) - return self.makeindex(req, tmpl, subdir) + return self.makeindex(req, res, tmpl, subdir) # prefixes not found - req.respond(HTTP_NOT_FOUND, ctype) - return tmpl("notfound", repo=virtual) + res.status = '404 Not Found' + res.setbodygen(tmpl.generate('notfound', {'repo': virtual})) + return res.sendresponse() - except ErrorResponse as err: - req.respond(err, ctype) - return tmpl('error', error=err.message or '') + except ErrorResponse as e: + res.status = statusmessage(e.code, pycompat.bytestr(e)) + res.setbodygen(tmpl.generate('error', {'error': e.message or ''})) + return res.sendresponse() finally: tmpl = None - def makeindex(self, req, tmpl, subdir=""): - - def archivelist(ui, nodeid, url): - allowed = ui.configlist("web", "allow_archive", untrusted=True) - archives = [] - for typ, spec in hgweb_mod.archivespecs.iteritems(): - if typ in allowed or ui.configbool("web", "allow" + typ, - untrusted=True): - archives.append({"type": typ, "extension": spec[2], - "node": nodeid, "url": url}) - return archives - - def rawentries(subdir="", **map): - - descend = self.ui.configbool('web', 'descend') - collapse = self.ui.configbool('web', 'collapse') - seenrepos = set() - seendirs = set() - for name, path in self.repos: - - if not name.startswith(subdir): - continue - name = name[len(subdir):] - directory = False - - if '/' 
in name: - if not descend: - continue - - nameparts = name.split('/') - rootname = nameparts[0] - - if not collapse: - pass - elif rootname in seendirs: - continue - elif rootname in seenrepos: - pass - else: - directory = True - name = rootname - - # redefine the path to refer to the directory - discarded = '/'.join(nameparts[1:]) - - # remove name parts plus accompanying slash - path = path[:-len(discarded) - 1] - - try: - r = hg.repository(self.ui, path) - directory = False - except (IOError, error.RepoError): - pass - - parts = [name] - parts.insert(0, '/' + subdir.rstrip('/')) - if req.env['SCRIPT_NAME']: - parts.insert(0, req.env['SCRIPT_NAME']) - url = re.sub(r'/+', '/', '/'.join(parts) + '/') - - # show either a directory entry or a repository - if directory: - # get the directory's time information - try: - d = (get_mtime(path), util.makedate()[1]) - except OSError: - continue - - # add '/' to the name to make it obvious that - # the entry is a directory, not a regular repository - row = {'contact': "", - 'contact_sort': "", - 'name': name + '/', - 'name_sort': name, - 'url': url, - 'description': "", - 'description_sort': "", - 'lastchange': d, - 'lastchange_sort': d[1]-d[0], - 'archives': [], - 'isdirectory': True, - 'labels': [], - } - - seendirs.add(name) - yield row - continue - - u = self.ui.copy() - try: - u.readconfig(os.path.join(path, '.hg', 'hgrc')) - except Exception as e: - u.warn(_('error reading %s/.hg/hgrc: %s\n') % (path, e)) - continue - def get(section, name, default=uimod._unset): - return u.config(section, name, default, untrusted=True) - - if u.configbool("web", "hidden", untrusted=True): - continue - - if not self.read_allowed(u, req): - continue - - # update time with local timezone - try: - r = hg.repository(self.ui, path) - except IOError: - u.warn(_('error accessing repository at %s\n') % path) - continue - except error.RepoError: - u.warn(_('error accessing repository at %s\n') % path) - continue - try: - d = 
(get_mtime(r.spath), util.makedate()[1]) - except OSError: - continue - - contact = get_contact(get) - description = get("web", "description") - seenrepos.add(name) - name = get("web", "name", name) - row = {'contact': contact or "unknown", - 'contact_sort': contact.upper() or "unknown", - 'name': name, - 'name_sort': name, - 'url': url, - 'description': description or "unknown", - 'description_sort': description.upper() or "unknown", - 'lastchange': d, - 'lastchange_sort': d[1]-d[0], - 'archives': archivelist(u, "tip", url), - 'isdirectory': None, - 'labels': u.configlist('web', 'labels', untrusted=True), - } - - yield row - - sortdefault = None, False - def entries(sortcolumn="", descending=False, subdir="", **map): - rows = rawentries(subdir=subdir, **map) - - if sortcolumn and sortdefault != (sortcolumn, descending): - sortkey = '%s_sort' % sortcolumn - rows = sorted(rows, key=lambda x: x[sortkey], - reverse=descending) - for row, parity in zip(rows, paritygen(self.stripecount)): - row['parity'] = parity - yield row - + def makeindex(self, req, res, tmpl, subdir=""): self.refresh() sortable = ["name", "description", "contact", "lastchange"] - sortcolumn, descending = sortdefault - if 'sort' in req.form: - sortcolumn = req.form['sort'][0] + sortcolumn, descending = None, False + if 'sort' in req.qsparams: + sortcolumn = req.qsparams['sort'] descending = sortcolumn.startswith('-') if descending: sortcolumn = sortcolumn[1:] @@ -489,12 +474,21 @@ for column in sortable] self.refresh() - self.updatereqenv(req.env) + + entries = indexentries(self.ui, self.repos, req, + self.stripecount, sortcolumn=sortcolumn, + descending=descending, subdir=subdir) - return tmpl("index", entries=entries, subdir=subdir, - pathdef=hgweb_mod.makebreadcrumb('/' + subdir, self.prefix), - sortcolumn=sortcolumn, descending=descending, - **dict(sort)) + mapping = { + 'entries': entries, + 'subdir': subdir, + 'pathdef': hgweb_mod.makebreadcrumb('/' + subdir, self.prefix), + 'sortcolumn': 
sortcolumn, + 'descending': descending, + } + mapping.update(sort) + res.setbodygen(tmpl.generate('index', mapping)) + return res.sendresponse() def templater(self, req, nonce): @@ -507,30 +501,24 @@ def config(section, name, default=uimod._unset, untrusted=True): return self.ui.config(section, name, default, untrusted) - self.updatereqenv(req.env) - - url = req.env.get('SCRIPT_NAME', '') - if not url.endswith('/'): - url += '/' - vars = {} styles, (style, mapfile) = hgweb_mod.getstyle(req, config, self.templatepath) if style == styles[0]: vars['style'] = style - start = r'&' if url[-1] == r'?' else r'?' - sessionvars = webutil.sessionvars(vars, start) + sessionvars = webutil.sessionvars(vars, r'?') logourl = config('web', 'logourl') logoimg = config('web', 'logoimg') - staticurl = config('web', 'staticurl') or url + 'static/' + staticurl = (config('web', 'staticurl') + or req.apppath + '/static/') if not staticurl.endswith('/'): staticurl += '/' defaults = { "encoding": encoding.encoding, "motd": motd, - "url": url, + "url": req.apppath + '/', "logourl": logourl, "logoimg": logoimg, "staticurl": staticurl, @@ -540,10 +528,3 @@ } tmpl = templater.templater.frommapfile(mapfile, defaults=defaults) return tmpl - - def updatereqenv(self, env): - if self._baseurl is not None: - name, port, path = geturlcgivars(self._baseurl, env['SERVER_PORT']) - env['SERVER_NAME'] = name - env['SERVER_PORT'] = port - env['SCRIPT_NAME'] = path diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/protocol.py --- a/mercurial/hgweb/protocol.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ -# -# Copyright 21 May 2005 - (c) 2005 Jake Edge -# Copyright 2005-2007 Matt Mackall -# -# This software may be used and distributed according to the terms of the -# GNU General Public License version 2 or any later version. - -from __future__ import absolute_import - -import cgi -import struct - -from .common import ( - HTTP_OK, -) - -from .. 
import ( - error, - pycompat, - util, - wireproto, -) -stringio = util.stringio - -urlerr = util.urlerr -urlreq = util.urlreq - -HGTYPE = 'application/mercurial-0.1' -HGTYPE2 = 'application/mercurial-0.2' -HGERRTYPE = 'application/hg-error' - -def decodevaluefromheaders(req, headerprefix): - """Decode a long value from multiple HTTP request headers. - - Returns the value as a bytes, not a str. - """ - chunks = [] - i = 1 - prefix = headerprefix.upper().replace(r'-', r'_') - while True: - v = req.env.get(r'HTTP_%s_%d' % (prefix, i)) - if v is None: - break - chunks.append(pycompat.bytesurl(v)) - i += 1 - - return ''.join(chunks) - -class webproto(wireproto.abstractserverproto): - def __init__(self, req, ui): - self.req = req - self.response = '' - self.ui = ui - self.name = 'http' - self.checkperm = req.checkperm - - def getargs(self, args): - knownargs = self._args() - data = {} - keys = args.split() - for k in keys: - if k == '*': - star = {} - for key in knownargs.keys(): - if key != 'cmd' and key not in keys: - star[key] = knownargs[key][0] - data['*'] = star - else: - data[k] = knownargs[k][0] - return [data[k] for k in keys] - def _args(self): - args = self.req.form.copy() - if pycompat.ispy3: - args = {k.encode('ascii'): [v.encode('ascii') for v in vs] - for k, vs in args.items()} - postlen = int(self.req.env.get(r'HTTP_X_HGARGS_POST', 0)) - if postlen: - args.update(cgi.parse_qs( - self.req.read(postlen), keep_blank_values=True)) - return args - - argvalue = decodevaluefromheaders(self.req, r'X-HgArg') - args.update(cgi.parse_qs(argvalue, keep_blank_values=True)) - return args - def getfile(self, fp): - length = int(self.req.env[r'CONTENT_LENGTH']) - # If httppostargs is used, we need to read Content-Length - # minus the amount that was consumed by args. 
- length -= int(self.req.env.get(r'HTTP_X_HGARGS_POST', 0)) - for s in util.filechunkiter(self.req, limit=length): - fp.write(s) - def redirect(self): - self.oldio = self.ui.fout, self.ui.ferr - self.ui.ferr = self.ui.fout = stringio() - def restore(self): - val = self.ui.fout.getvalue() - self.ui.ferr, self.ui.fout = self.oldio - return val - - def _client(self): - return 'remote:%s:%s:%s' % ( - self.req.env.get('wsgi.url_scheme') or 'http', - urlreq.quote(self.req.env.get('REMOTE_HOST', '')), - urlreq.quote(self.req.env.get('REMOTE_USER', ''))) - - def responsetype(self, prefer_uncompressed): - """Determine the appropriate response type and compression settings. - - Returns a tuple of (mediatype, compengine, engineopts). - """ - # Determine the response media type and compression engine based - # on the request parameters. - protocaps = decodevaluefromheaders(self.req, r'X-HgProto').split(' ') - - if '0.2' in protocaps: - # All clients are expected to support uncompressed data. - if prefer_uncompressed: - return HGTYPE2, util._noopengine(), {} - - # Default as defined by wire protocol spec. - compformats = ['zlib', 'none'] - for cap in protocaps: - if cap.startswith('comp='): - compformats = cap[5:].split(',') - break - - # Now find an agreed upon compression format. - for engine in wireproto.supportedcompengines(self.ui, self, - util.SERVERROLE): - if engine.wireprotosupport().name in compformats: - opts = {} - level = self.ui.configint('server', - '%slevel' % engine.name()) - if level is not None: - opts['level'] = level - - return HGTYPE2, engine, opts - - # No mutually supported compression format. Fall back to the - # legacy protocol. - - # Don't allow untrusted settings because disabling compression or - # setting a very high compression level could lead to flooding - # the server's network or CPU. 
- opts = {'level': self.ui.configint('server', 'zliblevel')} - return HGTYPE, util.compengines['zlib'], opts - -def iscmd(cmd): - return cmd in wireproto.commands - -def call(repo, req, cmd): - p = webproto(req, repo.ui) - - def genversion2(gen, engine, engineopts): - # application/mercurial-0.2 always sends a payload header - # identifying the compression engine. - name = engine.wireprotosupport().name - assert 0 < len(name) < 256 - yield struct.pack('B', len(name)) - yield name - - for chunk in gen: - yield chunk - - rsp = wireproto.dispatch(repo, p, cmd) - if isinstance(rsp, bytes): - req.respond(HTTP_OK, HGTYPE, body=rsp) - return [] - elif isinstance(rsp, wireproto.streamres_legacy): - gen = rsp.gen - req.respond(HTTP_OK, HGTYPE) - return gen - elif isinstance(rsp, wireproto.streamres): - gen = rsp.gen - - # This code for compression should not be streamres specific. It - # is here because we only compress streamres at the moment. - mediatype, engine, engineopts = p.responsetype(rsp.prefer_uncompressed) - gen = engine.compressstream(gen, engineopts) - - if mediatype == HGTYPE2: - gen = genversion2(gen, engine, engineopts) - - req.respond(HTTP_OK, mediatype) - return gen - elif isinstance(rsp, wireproto.pushres): - val = p.restore() - rsp = '%d\n%s' % (rsp.res, val) - req.respond(HTTP_OK, HGTYPE, body=rsp) - return [] - elif isinstance(rsp, wireproto.pusherr): - # drain the incoming bundle - req.drain() - p.restore() - rsp = '0\n%s\n' % rsp.res - req.respond(HTTP_OK, HGTYPE, body=rsp) - return [] - elif isinstance(rsp, wireproto.ooberror): - rsp = rsp.message - req.respond(HTTP_OK, HGERRTYPE, body=rsp) - return [] - raise error.ProgrammingError('hgweb.protocol internal failure', rsp) diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/request.py --- a/mercurial/hgweb/request.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/request.py Wed Apr 18 15:32:08 2018 -0400 @@ -8,143 +8,558 @@ from __future__ import absolute_import -import cgi -import errno 
-import socket +#import wsgiref.validate -from .common import ( - ErrorResponse, - HTTP_NOT_MODIFIED, - statusmessage, +from ..thirdparty import ( + attr, ) - from .. import ( + error, pycompat, util, ) -shortcuts = { - 'cl': [('cmd', ['changelog']), ('rev', None)], - 'sl': [('cmd', ['shortlog']), ('rev', None)], - 'cs': [('cmd', ['changeset']), ('node', None)], - 'f': [('cmd', ['file']), ('filenode', None)], - 'fl': [('cmd', ['filelog']), ('filenode', None)], - 'fd': [('cmd', ['filediff']), ('node', None)], - 'fa': [('cmd', ['annotate']), ('filenode', None)], - 'mf': [('cmd', ['manifest']), ('manifest', None)], - 'ca': [('cmd', ['archive']), ('node', None)], - 'tags': [('cmd', ['tags'])], - 'tip': [('cmd', ['changeset']), ('node', ['tip'])], - 'static': [('cmd', ['static']), ('file', None)] -} +class multidict(object): + """A dict like object that can store multiple values for a key. + + Used to store parsed request parameters. + + This is inspired by WebOb's class of the same name. + """ + def __init__(self): + self._items = {} + + def __getitem__(self, key): + """Returns the last set value for a key.""" + return self._items[key][-1] + + def __setitem__(self, key, value): + """Replace all values for a key with a new value.""" + self._items[key] = [value] + + def __delitem__(self, key): + """Delete all values for a key.""" + del self._items[key] + + def __contains__(self, key): + return key in self._items + + def __len__(self): + return len(self._items) + + def get(self, key, default=None): + try: + return self.__getitem__(key) + except KeyError: + return default + + def add(self, key, value): + """Add a new value for a key. Does not replace existing values.""" + self._items.setdefault(key, []).append(value) + + def getall(self, key): + """Obtains all values for a key.""" + return self._items.get(key, []) + + def getone(self, key): + """Obtain a single value for a key. + + Raises KeyError if key not defined or it has multiple values set.
+ """ + vals = self._items[key] + + if len(vals) > 1: + raise KeyError('multiple values for %r' % key) + + return vals[0] + + def asdictoflists(self): + return {k: list(v) for k, v in self._items.iteritems()} + +@attr.s(frozen=True) +class parsedrequest(object): + """Represents a parsed WSGI request. + + Contains both parsed parameters as well as a handle on the input stream. + """ -def normalize(form): - # first expand the shortcuts - for k in shortcuts: - if k in form: - for name, value in shortcuts[k]: - if value is None: - value = form[k] - form[name] = value - del form[k] - # And strip the values - for k, v in form.iteritems(): - form[k] = [i.strip() for i in v] - return form + # Request method. + method = attr.ib() + # Full URL for this request. + url = attr.ib() + # URL without any path components. Just ``scheme://host[:port]``. + baseurl = attr.ib() + # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead + # of HTTP: Host header for hostname. This is likely what clients used. + advertisedurl = attr.ib() + advertisedbaseurl = attr.ib() + # URL scheme (part before ``://``). e.g. ``http`` or ``https``. + urlscheme = attr.ib() + # Value of REMOTE_USER, if set, or None. + remoteuser = attr.ib() + # Value of REMOTE_HOST, if set, or None. + remotehost = attr.ib() + # Relative WSGI application path. If defined, will begin with a + # ``/``. + apppath = attr.ib() + # List of path parts to be used for dispatch. + dispatchparts = attr.ib() + # URL path component (no query string) used for dispatch. Can be + # ``None`` to signal no path component given to the request, an + # empty string to signal a request to the application's root URL, + # or a string not beginning with ``/`` containing the requested + # path under the application. + dispatchpath = attr.ib() + # The name of the repository being accessed. + reponame = attr.ib() + # Raw query string (part after "?" in URL). + querystring = attr.ib() + # multidict of query string parameters.
+ qsparams = attr.ib() + # wsgiref.headers.Headers instance. Operates like a dict with case + # insensitive keys. + headers = attr.ib() + # Request body input stream. + bodyfh = attr.ib() + # WSGI environment dict, unmodified. + rawenv = attr.ib() -class wsgirequest(object): - """Higher-level API for a WSGI request. +def parserequestfromenv(env, reponame=None, altbaseurl=None): + """Parse URL components from environment variables. + + WSGI defines request attributes via environment variables. This function + parses the environment variables into a data structure. - WSGI applications are invoked with 2 arguments. They are used to - instantiate instances of this class, which provides higher-level APIs - for obtaining request parameters, writing HTTP output, etc. + If ``reponame`` is defined, the leading path components matching that + string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``. + This simulates the world view of a WSGI application that processes + requests from the base URL of a repo. + + If ``altbaseurl`` (typically comes from ``web.baseurl`` config option) + is defined, it is used - instead of the WSGI environment variables - for + constructing URL components up to and including the WSGI application path. + For example, if the current WSGI application is at ``/repo`` and a request + is made to ``/rev/@`` with this argument set to + ``http://myserver:9000/prefix``, the URL and path components will resolve as + if the request were to ``http://myserver:9000/prefix/rev/@``. In other + words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and + ``SCRIPT_NAME`` are all effectively replaced by components from this URL. 
""" - def __init__(self, wsgienv, start_response): - version = wsgienv[r'wsgi.version'] - if (version < (1, 0)) or (version >= (2, 0)): - raise RuntimeError("Unknown and unsupported WSGI version %d.%d" - % version) - self.inp = wsgienv[r'wsgi.input'] - self.err = wsgienv[r'wsgi.errors'] - self.threaded = wsgienv[r'wsgi.multithread'] - self.multiprocess = wsgienv[r'wsgi.multiprocess'] - self.run_once = wsgienv[r'wsgi.run_once'] - self.env = wsgienv - self.form = normalize(cgi.parse(self.inp, - self.env, - keep_blank_values=1)) - self._start_response = start_response - self.server_write = None - self.headers = [] + # PEP 3333 defines the WSGI spec and is a useful reference for this code. + + # We first validate that the incoming object conforms with the WSGI spec. + # We only want to be dealing with spec-conforming WSGI implementations. + # TODO enable this once we fix internal violations. + #wsgiref.validate.check_environ(env) + + # PEP-0333 states that environment keys and values are native strings + # (bytes on Python 2 and str on Python 3). The code points for the Unicode + # strings on Python 3 must be between \00000-\000FF. We deal with bytes + # in Mercurial, so mass convert string keys and values to bytes. + if pycompat.ispy3: + env = {k.encode('latin-1'): v for k, v in env.iteritems()} + env = {k: v.encode('latin-1') if isinstance(v, str) else v + for k, v in env.iteritems()} - def __iter__(self): - return iter([]) + # Some hosting solutions are emulating hgwebdir, and dispatching directly + # to an hgweb instance using this environment variable. This was always + # checked prior to d7fd203e36cc; keep doing so to avoid breaking them. + if not reponame: + reponame = env.get('REPO_NAME') + + if altbaseurl: + altbaseurl = util.url(altbaseurl) + + # https://www.python.org/dev/peps/pep-0333/#environ-variables defines + # the environment variables. + # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines + # how URLs are reconstructed. 
+ fullurl = env['wsgi.url_scheme'] + '://' + + if altbaseurl and altbaseurl.scheme: + advertisedfullurl = altbaseurl.scheme + '://' + else: + advertisedfullurl = fullurl - def read(self, count=-1): - return self.inp.read(count) + def addport(s, port): + if s.startswith('https://'): + if port != '443': + s += ':' + port + else: + if port != '80': + s += ':' + port + + return s + + if env.get('HTTP_HOST'): + fullurl += env['HTTP_HOST'] + else: + fullurl += env['SERVER_NAME'] + fullurl = addport(fullurl, env['SERVER_PORT']) + + if altbaseurl and altbaseurl.host: + advertisedfullurl += altbaseurl.host - def drain(self): - '''need to read all data from request, httplib is half-duplex''' - length = int(self.env.get('CONTENT_LENGTH') or 0) - for s in util.filechunkiter(self.inp, limit=length): - pass + if altbaseurl.port: + port = altbaseurl.port + elif altbaseurl.scheme == 'http' and not altbaseurl.port: + port = '80' + elif altbaseurl.scheme == 'https' and not altbaseurl.port: + port = '443' + else: + port = env['SERVER_PORT'] + + advertisedfullurl = addport(advertisedfullurl, port) + else: + advertisedfullurl += env['SERVER_NAME'] + advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT']) + + baseurl = fullurl + advertisedbaseurl = advertisedfullurl + + fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', '')) + fullurl += util.urlreq.quote(env.get('PATH_INFO', '')) - def respond(self, status, type, filename=None, body=None): - if not isinstance(type, str): - type = pycompat.sysstr(type) - if self._start_response is not None: - self.headers.append((r'Content-Type', type)) - if filename: - filename = (filename.rpartition('/')[-1] - .replace('\\', '\\\\').replace('"', '\\"')) - self.headers.append(('Content-Disposition', - 'inline; filename="%s"' % filename)) - if body is not None: - self.headers.append((r'Content-Length', str(len(body)))) + if altbaseurl: + path = altbaseurl.path or '' + if path and not path.startswith('/'): + path = '/' + path + 
advertisedfullurl += util.urlreq.quote(path) + else: + advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', '')) + + advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', '')) + + if env.get('QUERY_STRING'): + fullurl += '?' + env['QUERY_STRING'] + advertisedfullurl += '?' + env['QUERY_STRING'] + + # If ``reponame`` is defined, that must be a prefix on PATH_INFO + # that represents the repository being dispatched to. When computing + # the dispatch info, we ignore these leading path components. - for k, v in self.headers: - if not isinstance(v, str): - raise TypeError('header value must be string: %r' % (v,)) + if altbaseurl: + apppath = altbaseurl.path or '' + if apppath and not apppath.startswith('/'): + apppath = '/' + apppath + else: + apppath = env.get('SCRIPT_NAME', '') + + if reponame: + repoprefix = '/' + reponame.strip('/') + + if not env.get('PATH_INFO'): + raise error.ProgrammingError('reponame requires PATH_INFO') + + if not env['PATH_INFO'].startswith(repoprefix): + raise error.ProgrammingError('PATH_INFO does not begin with repo ' + 'name: %s (%s)' % (env['PATH_INFO'], + reponame)) + + dispatchpath = env['PATH_INFO'][len(repoprefix):] + + if dispatchpath and not dispatchpath.startswith('/'): + raise error.ProgrammingError('reponame prefix of PATH_INFO does ' + 'not end at path delimiter: %s (%s)' % + (env['PATH_INFO'], reponame)) - if isinstance(status, ErrorResponse): - self.headers.extend(status.headers) - if status.code == HTTP_NOT_MODIFIED: - # RFC 2616 Section 10.3.5: 304 Not Modified has cases where - # it MUST NOT include any headers other than these and no - # body - self.headers = [(k, v) for (k, v) in self.headers if - k in ('Date', 'ETag', 'Expires', - 'Cache-Control', 'Vary')] - status = statusmessage(status.code, str(status)) - elif status == 200: - status = '200 Script output follows' - elif isinstance(status, int): - status = statusmessage(status) + apppath = apppath.rstrip('/') + repoprefix + dispatchparts = 
dispatchpath.strip('/').split('/') + dispatchpath = '/'.join(dispatchparts) + + elif 'PATH_INFO' in env: + if env['PATH_INFO'].strip('/'): + dispatchparts = env['PATH_INFO'].strip('/').split('/') + dispatchpath = '/'.join(dispatchparts) + else: + dispatchparts = [] + dispatchpath = '' + else: + dispatchparts = [] + dispatchpath = None + + querystring = env.get('QUERY_STRING', '') + + # We store as a list so we have ordering information. We also store as + # a dict to facilitate fast lookup. + qsparams = multidict() + for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True): + qsparams.add(k, v) + + # HTTP_* keys contain HTTP request headers. The Headers structure should + # perform case normalization for us. We just rewrite underscore to dash + # so keys match what likely went over the wire. + headers = [] + for k, v in env.iteritems(): + if k.startswith('HTTP_'): + headers.append((k[len('HTTP_'):].replace('_', '-'), v)) + + from . import wsgiheaders # avoid cycle + headers = wsgiheaders.Headers(headers) + + # This is kind of a lie because the HTTP header wasn't explicitly + # sent. But for all intents and purposes it should be OK to lie about + # this, since a consumer will use either value to determine how many + # bytes are available to read.
+ if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env: + headers['Content-Length'] = env['CONTENT_LENGTH'] + + if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env: + headers['Content-Type'] = env['CONTENT_TYPE'] - self.server_write = self._start_response(status, self.headers) - self._start_response = None - self.headers = [] - if body is not None: - self.write(body) - self.server_write = None + bodyfh = env['wsgi.input'] + if 'Content-Length' in headers: + bodyfh = util.cappedreader(bodyfh, int(headers['Content-Length'])) + + return parsedrequest(method=env['REQUEST_METHOD'], + url=fullurl, baseurl=baseurl, + advertisedurl=advertisedfullurl, + advertisedbaseurl=advertisedbaseurl, + urlscheme=env['wsgi.url_scheme'], + remoteuser=env.get('REMOTE_USER'), + remotehost=env.get('REMOTE_HOST'), + apppath=apppath, + dispatchparts=dispatchparts, dispatchpath=dispatchpath, + reponame=reponame, + querystring=querystring, + qsparams=qsparams, + headers=headers, + bodyfh=bodyfh, + rawenv=env) - def write(self, thing): - if thing: - try: - self.server_write(thing) - except socket.error as inst: - if inst[0] != errno.ECONNRESET: - raise +class offsettrackingwriter(object): + """A file object like object that is append only and tracks write count. + + Instances are bound to a callable. This callable is called with data + whenever a ``write()`` is attempted. + + Instances track the amount of written data so they can answer ``tell()`` + requests. - def writelines(self, lines): - for line in lines: - self.write(line) + The intent of this class is to wrap the ``write()`` function returned by + a WSGI ``start_response()`` function. Since ``write()`` is a callable and + not a file object, it doesn't implement other file object methods. + """ + def __init__(self, writefn): + self._write = writefn + self._offset = 0 + + def write(self, s): + res = self._write(s) + # Some Python objects don't report the number of bytes written. 
+ if res is None: + self._offset += len(s) + else: + self._offset += res def flush(self): - return None + pass + + def tell(self): + return self._offset + +class wsgiresponse(object): + """Represents a response to a WSGI request. + + A response consists of a status line, headers, and a body. + + Consumers must populate the ``status`` and ``headers`` fields and + make a call to a ``setbody*()`` method before the response can be + issued. + + When it is time to start sending the response over the wire, + ``sendresponse()`` is called. It handles emitting the header portion + of the response message. It then yields chunks of body data to be + written to the peer. Typically, the WSGI application itself calls + and returns the value from ``sendresponse()``. + """ + + def __init__(self, req, startresponse): + """Create an empty response tied to a specific request. + + ``req`` is a ``parsedrequest``. ``startresponse`` is the + ``start_response`` function passed to the WSGI application. + """ + self._req = req + self._startresponse = startresponse + + self.status = None + from . import wsgiheaders # avoid cycle + self.headers = wsgiheaders.Headers([]) + + self._bodybytes = None + self._bodygen = None + self._bodywillwrite = False + self._started = False + self._bodywritefn = None + + def _verifybody(self): + if (self._bodybytes is not None or self._bodygen is not None + or self._bodywillwrite): + raise error.ProgrammingError('cannot define body multiple times') + + def setbodybytes(self, b): + """Define the response body as static bytes. + + The empty string signals that there is no response body. + """ + self._verifybody() + self._bodybytes = b + self.headers['Content-Length'] = '%d' % len(b) + + def setbodygen(self, gen): + """Define the response body as a generator of bytes.""" + self._verifybody() + self._bodygen = gen + + def setbodywillwrite(self): + """Signal an intent to use write() to emit the response body. 
+ + **This is the least preferred way to send a body.** + + It is preferred for WSGI applications to emit a generator of chunks + constituting the response body. However, some consumers can't emit + data this way. So, WSGI provides a way to obtain a ``write(data)`` + function that can be used to synchronously perform an unbuffered + write. + + Calling this function signals an intent to produce the body in this + manner. + """ + self._verifybody() + self._bodywillwrite = True + + def sendresponse(self): + """Send the generated response to the client. + + Before this is called, ``status`` must be set and one of + ``setbodybytes()`` or ``setbodygen()`` must be called. + + Calling this method multiple times is not allowed. + """ + if self._started: + raise error.ProgrammingError('sendresponse() called multiple times') + + self._started = True + + if not self.status: + raise error.ProgrammingError('status line not defined') + + if (self._bodybytes is None and self._bodygen is None + and not self._bodywillwrite): + raise error.ProgrammingError('response body not defined') - def close(self): - return None + # RFC 7232 Section 4.1 states that a 304 MUST generate one of + # {Cache-Control, Content-Location, Date, ETag, Expires, Vary} + # and SHOULD NOT generate other headers unless they could be used + # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2 + # states that no response body can be issued. Content-Length can + # be sent. But if it is present, it should be the size of the response + # that wasn't transferred. + if self.status.startswith('304 '): + # setbodybytes('') will set C-L to 0. This doesn't conform with the + # spec. So remove it. + if self.headers.get('Content-Length') == '0': + del self.headers['Content-Length'] + + # Strictly speaking, this is too strict. But until it causes + # problems, let's be strict. 
+ badheaders = {k for k in self.headers.keys() + if k.lower() not in ('date', 'etag', 'expires', + 'cache-control', + 'content-location', + 'vary')} + if badheaders: + raise error.ProgrammingError( + 'illegal header on 304 response: %s' % + ', '.join(sorted(badheaders))) + + if self._bodygen is not None or self._bodywillwrite: + raise error.ProgrammingError("must use setbodybytes('') with " + "304 responses") + + # Various HTTP clients (notably httplib) won't read the HTTP response + # until the HTTP request has been sent in full. If servers (us) send a + # response before the HTTP request has been fully sent, the connection + # may deadlock because neither end is reading. + # + # We work around this by "draining" the request data before + # sending any response in some conditions. + drain = False + close = False + + # If the client sent Expect: 100-continue, we assume it is smart enough + # to deal with the server sending a response before reading the request. + # (httplib doesn't do this.) + if self._req.headers.get('Expect', '').lower() == '100-continue': + pass + # Only tend to request methods that have bodies. Strictly speaking, + # we should sniff for a body. But this is fine for our existing + # WSGI applications. + elif self._req.method not in ('POST', 'PUT'): + pass + else: + # If we don't know how much data to read, there's no guarantee + # that we can drain the request responsibly. The WSGI + # specification only says that servers *should* ensure the + # input stream doesn't overrun the actual request. So there's + # no guarantee that reading until EOF won't corrupt the stream + # state. + if not isinstance(self._req.bodyfh, util.cappedreader): + close = True + else: + # We /could/ only drain certain HTTP response codes. But 200 and + # non-200 wire protocol responses both require draining. Since + # we have a capped reader in place for all situations where we + # drain, it is safe to read from that stream. 
We'll either do + # a drain or no-op if we're already at EOF. + drain = True + + if close: + self.headers['Connection'] = 'Close' + + if drain: + assert isinstance(self._req.bodyfh, util.cappedreader) + while True: + chunk = self._req.bodyfh.read(32768) + if not chunk: + break + + strheaders = [(pycompat.strurl(k), pycompat.strurl(v)) for + k, v in self.headers.items()] + write = self._startresponse(pycompat.sysstr(self.status), + strheaders) + + if self._bodybytes: + yield self._bodybytes + elif self._bodygen: + for chunk in self._bodygen: + yield chunk + elif self._bodywillwrite: + self._bodywritefn = write + else: + error.ProgrammingError('do not know how to send body') + + def getbodyfile(self): + """Obtain a file object like object representing the response body. + + For this to work, you must call ``setbodywillwrite()`` and then + ``sendresponse()`` first. ``sendresponse()`` is a generator and the + function won't run to completion unless the generator is advanced. The + generator yields no items. The easiest way to consume it is with + ``list(res.sendresponse())``, which should resolve to an empty list - + ``[]``. + """ + if not self._bodywillwrite: + raise error.ProgrammingError('must call setbodywillwrite() first') + + if not self._started: + raise error.ProgrammingError('must call sendresponse() first; did ' + 'you remember to consume it since it ' + 'is a generator?') + + assert self._bodywritefn + return offsettrackingwriter(self._bodywritefn) def wsgiapplication(app_maker): '''For compatibility with old CGI scripts.
A plain hgweb() or hgwebdir() diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/server.py --- a/mercurial/hgweb/server.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/server.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,6 +13,7 @@ import socket import sys import traceback +import wsgiref.validate from ..i18n import _ @@ -111,6 +112,9 @@ self.log_error(r"Exception happened during processing " r"request '%s':%s%s", self.path, newline, tb) + def do_PUT(self): + self.do_POST() + def do_GET(self): self.do_POST() @@ -132,29 +136,28 @@ env[r'SERVER_NAME'] = self.server.server_name env[r'SERVER_PORT'] = str(self.server.server_port) env[r'REQUEST_URI'] = self.path - env[r'SCRIPT_NAME'] = self.server.prefix - env[r'PATH_INFO'] = path[len(self.server.prefix):] + env[r'SCRIPT_NAME'] = pycompat.sysstr(self.server.prefix) + env[r'PATH_INFO'] = pycompat.sysstr(path[len(self.server.prefix):]) env[r'REMOTE_HOST'] = self.client_address[0] env[r'REMOTE_ADDR'] = self.client_address[0] - if query: - env[r'QUERY_STRING'] = query + env[r'QUERY_STRING'] = query or r'' if pycompat.ispy3: if self.headers.get_content_type() is None: env[r'CONTENT_TYPE'] = self.headers.get_default_type() else: env[r'CONTENT_TYPE'] = self.headers.get_content_type() - length = self.headers.get('content-length') + length = self.headers.get(r'content-length') else: if self.headers.typeheader is None: env[r'CONTENT_TYPE'] = self.headers.type else: env[r'CONTENT_TYPE'] = self.headers.typeheader - length = self.headers.getheader('content-length') + length = self.headers.getheader(r'content-length') if length: env[r'CONTENT_LENGTH'] = length for header in [h for h in self.headers.keys() - if h not in ('content-type', 'content-length')]: + if h not in (r'content-type', r'content-length')]: hkey = r'HTTP_' + header.replace(r'-', r'_').upper() hval = self.headers.get(header) hval = hval.replace(r'\n', r'').strip() @@ -162,7 +165,7 @@ env[hkey] = hval env[r'SERVER_PROTOCOL'] = self.request_version 
env[r'wsgi.version'] = (1, 0) - env[r'wsgi.url_scheme'] = self.url_scheme + env[r'wsgi.url_scheme'] = pycompat.sysstr(self.url_scheme) if env.get(r'HTTP_EXPECT', '').lower() == '100-continue': self.rfile = common.continuereader(self.rfile, self.wfile.write) @@ -174,6 +177,8 @@ socketserver.ForkingMixIn) env[r'wsgi.run_once'] = 0 + wsgiref.validate.check_environ(env) + self.saved_status = None self.saved_headers = [] self.length = None @@ -237,6 +242,11 @@ self.wfile.write('0\r\n\r\n') self.wfile.flush() + def version_string(self): + if self.server.serverheader: + return self.server.serverheader + return httpservermod.basehttprequesthandler.version_string(self) + class _httprequesthandlerssl(_httprequesthandler): """HTTPS handler based on Python's ssl module""" @@ -265,8 +275,8 @@ def setup(self): self.connection = self.request - self.rfile = socket._fileobject(self.request, "rb", self.rbufsize) - self.wfile = socket._fileobject(self.request, "wb", self.wbufsize) + self.rfile = self.request.makefile(r"rb", self.rbufsize) + self.wfile = self.request.makefile(r"wb", self.wbufsize) try: import threading @@ -281,7 +291,7 @@ def openlog(opt, default): if opt and opt != '-': - return open(opt, 'a') + return open(opt, 'ab') return default class MercurialHTTPServer(_mixin, httpservermod.httpserver, object): @@ -310,6 +320,8 @@ self.addr, self.port = self.socket.getsockname()[0:2] self.fqaddr = socket.getfqdn(addr[0]) + self.serverheader = ui.config('web', 'server-header') + class IPv6HTTPServer(MercurialHTTPServer): address_family = getattr(socket, 'AF_INET6', None) def __init__(self, *args, **kwargs): diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/webcommands.py --- a/mercurial/hgweb/webcommands.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/webcommands.py Wed Apr 18 15:32:08 2018 -0400 @@ -19,7 +19,6 @@ ErrorResponse, HTTP_FORBIDDEN, HTTP_NOT_FOUND, - HTTP_OK, get_contact, paritygen, staticfile, @@ -37,7 +36,11 @@ scmutil, smartset, templater, - util, + 
templateutil, +) + +from ..utils import ( + stringutil, ) from . import ( @@ -53,10 +56,20 @@ The decorator takes as its positional arguments the name/path the command should be accessible under. + When called, functions receive as arguments a ``requestcontext``, + ``wsgirequest``, and a templater instance for generatoring output. + The functions should populate the ``rctx.res`` object with details + about the HTTP response. + + The function returns a generator to be consumed by the WSGI application. + For most commands, this should be the result from + ``web.res.sendresponse()``. Many commands will call ``web.sendtemplate()`` + to render a template. + Usage: @webcommand('mycommand') - def mycommand(web, req, tmpl): + def mycommand(web): pass """ @@ -69,7 +82,7 @@ return func @webcommand('log') -def log(web, req, tmpl): +def log(web): """ /log[/{revision}[/{path}]] -------------------------- @@ -85,28 +98,24 @@ file will be shown. This form is equivalent to the ``filelog`` handler. """ - if 'file' in req.form and req.form['file'][0]: - return filelog(web, req, tmpl) + if web.req.qsparams.get('file'): + return filelog(web) else: - return changelog(web, req, tmpl) + return changelog(web) @webcommand('rawfile') -def rawfile(web, req, tmpl): +def rawfile(web): guessmime = web.configbool('web', 'guessmime') - path = webutil.cleanpath(web.repo, req.form.get('file', [''])[0]) + path = webutil.cleanpath(web.repo, web.req.qsparams.get('file', '')) if not path: - content = manifest(web, req, tmpl) - req.respond(HTTP_OK, web.ctype) - return content + return manifest(web) try: - fctx = webutil.filectx(web.repo, req) + fctx = webutil.filectx(web.repo, web.req) except error.LookupError as inst: try: - content = manifest(web, req, tmpl) - req.respond(HTTP_OK, web.ctype) - return content + return manifest(web) except ErrorResponse: raise inst @@ -116,23 +125,27 @@ if guessmime: mt = mimetypes.guess_type(path)[0] if mt is None: - if util.binary(text): + if stringutil.binary(text): 
mt = 'application/binary' else: mt = 'text/plain' if mt.startswith('text/'): mt += '; charset="%s"' % encoding.encoding - req.respond(HTTP_OK, mt, path, body=text) - return [] + web.res.headers['Content-Type'] = mt + filename = (path.rpartition('/')[-1] + .replace('\\', '\\\\').replace('"', '\\"')) + web.res.headers['Content-Disposition'] = 'inline; filename="%s"' % filename + web.res.setbodybytes(text) + return web.res.sendresponse() -def _filerevision(web, req, tmpl, fctx): +def _filerevision(web, fctx): f = fctx.path() text = fctx.data() parity = paritygen(web.stripecount) ishead = fctx.filerev() in fctx.filelog().headrevs() - if util.binary(text): + if stringutil.binary(text): mt = mimetypes.guess_type(f)[0] or 'application/octet-stream' text = '(binary:%s)' % mt @@ -143,18 +156,19 @@ "linenumber": "% 6d" % (lineno + 1), "parity": next(parity)} - return tmpl("filerevision", - file=f, - path=webutil.up(f), - text=lines(), - symrev=webutil.symrevorshortnode(req, fctx), - rename=webutil.renamelink(fctx), - permissions=fctx.manifest().flags(f), - ishead=int(ishead), - **webutil.commonentry(web.repo, fctx)) + return web.sendtemplate( + 'filerevision', + file=f, + path=webutil.up(f), + text=lines(), + symrev=webutil.symrevorshortnode(web.req, fctx), + rename=webutil.renamelink(fctx), + permissions=fctx.manifest().flags(f), + ishead=int(ishead), + **pycompat.strkwargs(webutil.commonentry(web.repo, fctx))) @webcommand('file') -def file(web, req, tmpl): +def file(web): """ /file/{revision}[/{path}] ------------------------- @@ -173,18 +187,21 @@ If ``path`` is not defined, information about the root directory will be rendered. 
""" - path = webutil.cleanpath(web.repo, req.form.get('file', [''])[0]) + if web.req.qsparams.get('style') == 'raw': + return rawfile(web) + + path = webutil.cleanpath(web.repo, web.req.qsparams.get('file', '')) if not path: - return manifest(web, req, tmpl) + return manifest(web) try: - return _filerevision(web, req, tmpl, webutil.filectx(web.repo, req)) + return _filerevision(web, webutil.filectx(web.repo, web.req)) except error.LookupError as inst: try: - return manifest(web, req, tmpl) + return manifest(web) except ErrorResponse: raise inst -def _search(web, req, tmpl): +def _search(web): MODE_REVISION = 'rev' MODE_KEYWORD = 'keyword' MODE_REVSET = 'revset' @@ -232,7 +249,7 @@ def getsearchmode(query): try: - ctx = web.repo[query] + ctx = scmutil.revsymbol(web.repo, query) except (error.RepoError, error.LookupError): # query is not an exact revision pointer, need to # decide if it's a revset expression or keywords @@ -259,7 +276,8 @@ if not funcsused.issubset(revset.safesymbols): return MODE_KEYWORD, query - mfunc = revset.match(web.repo.ui, revdef, repo=web.repo) + mfunc = revset.match(web.repo.ui, revdef, + lookup=revset.lookupfn(web.repo)) try: revs = mfunc(web.repo) return MODE_REVSET, revs @@ -271,44 +289,47 @@ LookupError): return MODE_KEYWORD, query - def changelist(**map): + def changelist(context): count = 0 for ctx in searchfunc[0](funcarg): count += 1 n = ctx.node() - showtags = webutil.showtag(web.repo, tmpl, 'changelogtag', n) - files = webutil.listfilediffs(tmpl, ctx.files(), n, web.maxfiles) + showtags = webutil.showtag(web.repo, web.tmpl, 'changelogtag', n) + files = webutil.listfilediffs(web.tmpl, ctx.files(), n, + web.maxfiles) - yield tmpl('searchentry', - parity=next(parity), - changelogtag=showtags, - files=files, - **webutil.commonentry(web.repo, ctx)) + lm = webutil.commonentry(web.repo, ctx) + lm.update({ + 'parity': next(parity), + 'changelogtag': showtags, + 'files': files, + }) + yield lm if count >= revcount: break - query = 
req.form['rev'][0] + query = web.req.qsparams['rev'] revcount = web.maxchanges - if 'revcount' in req.form: + if 'revcount' in web.req.qsparams: try: - revcount = int(req.form.get('revcount', [revcount])[0]) + revcount = int(web.req.qsparams.get('revcount', revcount)) revcount = max(revcount, 1) - tmpl.defaults['sessionvars']['revcount'] = revcount + web.tmpl.defaults['sessionvars']['revcount'] = revcount except ValueError: pass - lessvars = copy.copy(tmpl.defaults['sessionvars']) - lessvars['revcount'] = max(revcount / 2, 1) + lessvars = copy.copy(web.tmpl.defaults['sessionvars']) + lessvars['revcount'] = max(revcount // 2, 1) lessvars['rev'] = query - morevars = copy.copy(tmpl.defaults['sessionvars']) + morevars = copy.copy(web.tmpl.defaults['sessionvars']) morevars['revcount'] = revcount * 2 morevars['rev'] = query mode, funcarg = getsearchmode(query) - if 'forcekw' in req.form: + if 'forcekw' in web.req.qsparams: showforcekw = '' showunforcekw = searchfuncs[mode][1] mode = MODE_KEYWORD @@ -325,14 +346,21 @@ tip = web.repo['tip'] parity = paritygen(web.stripecount) - return tmpl('search', query=query, node=tip.hex(), symrev='tip', - entries=changelist, archives=web.archivelist("tip"), - morevars=morevars, lessvars=lessvars, - modedesc=searchfunc[1], - showforcekw=showforcekw, showunforcekw=showunforcekw) + return web.sendtemplate( + 'search', + query=query, + node=tip.hex(), + symrev='tip', + entries=templateutil.mappinggenerator(changelist, name='searchentry'), + archives=web.archivelist('tip'), + morevars=morevars, + lessvars=lessvars, + modedesc=searchfunc[1], + showforcekw=showforcekw, + showunforcekw=showunforcekw) @webcommand('changelog') -def changelog(web, req, tmpl, shortlog=False): +def changelog(web, shortlog=False): """ /changelog[/{revision}] ----------------------- @@ -358,11 +386,11 @@ """ query = '' - if 'node' in req.form: - ctx = webutil.changectx(web.repo, req) - symrev = webutil.symrevorshortnode(req, ctx) - elif 'rev' in req.form: - return 
_search(web, req, tmpl) + if 'node' in web.req.qsparams: + ctx = webutil.changectx(web.repo, web.req) + symrev = webutil.symrevorshortnode(web.req, ctx) + elif 'rev' in web.req.qsparams: + return _search(web) else: ctx = web.repo['tip'] symrev = 'tip' @@ -377,7 +405,7 @@ if curcount > revcount + 1: break - entry = webutil.changelistentry(web, web.repo[rev], tmpl) + entry = webutil.changelistentry(web, web.repo[rev]) entry['parity'] = next(parity) yield entry @@ -386,17 +414,17 @@ else: revcount = web.maxchanges - if 'revcount' in req.form: + if 'revcount' in web.req.qsparams: try: - revcount = int(req.form.get('revcount', [revcount])[0]) + revcount = int(web.req.qsparams.get('revcount', revcount)) revcount = max(revcount, 1) - tmpl.defaults['sessionvars']['revcount'] = revcount + web.tmpl.defaults['sessionvars']['revcount'] = revcount except ValueError: pass - lessvars = copy.copy(tmpl.defaults['sessionvars']) - lessvars['revcount'] = max(revcount / 2, 1) - morevars = copy.copy(tmpl.defaults['sessionvars']) + lessvars = copy.copy(web.tmpl.defaults['sessionvars']) + lessvars['revcount'] = max(revcount // 2, 1) + morevars = copy.copy(web.tmpl.defaults['sessionvars']) morevars['revcount'] = revcount * 2 count = len(web.repo) @@ -413,15 +441,24 @@ else: nextentry = [] - return tmpl('shortlog' if shortlog else 'changelog', changenav=changenav, - node=ctx.hex(), rev=pos, symrev=symrev, changesets=count, - entries=entries, - latestentry=latestentry, nextentry=nextentry, - archives=web.archivelist("tip"), revcount=revcount, - morevars=morevars, lessvars=lessvars, query=query) + return web.sendtemplate( + 'shortlog' if shortlog else 'changelog', + changenav=changenav, + node=ctx.hex(), + rev=pos, + symrev=symrev, + changesets=count, + entries=entries, + latestentry=latestentry, + nextentry=nextentry, + archives=web.archivelist('tip'), + revcount=revcount, + morevars=morevars, + lessvars=lessvars, + query=query) @webcommand('shortlog') -def shortlog(web, req, tmpl): +def 
shortlog(web): """ /shortlog --------- @@ -432,10 +469,10 @@ difference is the ``shortlog`` template will be rendered instead of the ``changelog`` template. """ - return changelog(web, req, tmpl, shortlog=True) + return changelog(web, shortlog=True) @webcommand('changeset') -def changeset(web, req, tmpl): +def changeset(web): """ /changeset[/{revision}] ----------------------- @@ -450,9 +487,11 @@ ``changesetbookmark``, ``filenodelink``, ``filenolink``, and the many templates related to diffs may all be used to produce the output. """ - ctx = webutil.changectx(web.repo, req) + ctx = webutil.changectx(web.repo, web.req) - return tmpl('changeset', **webutil.changesetentry(web, req, tmpl, ctx)) + return web.sendtemplate( + 'changeset', + **webutil.changesetentry(web, ctx)) rev = webcommand('rev')(changeset) @@ -465,7 +504,7 @@ return path @webcommand('manifest') -def manifest(web, req, tmpl): +def manifest(web): """ /manifest[/{revision}[/{path}]] ------------------------------- @@ -481,13 +520,13 @@ The ``manifest`` template will be rendered for this handler. 
""" - if 'node' in req.form: - ctx = webutil.changectx(web.repo, req) - symrev = webutil.symrevorshortnode(req, ctx) + if 'node' in web.req.qsparams: + ctx = webutil.changectx(web.repo, web.req) + symrev = webutil.symrevorshortnode(web.req, ctx) else: ctx = web.repo['tip'] symrev = 'tip' - path = webutil.cleanpath(web.repo, req.form.get('file', [''])[0]) + path = webutil.cleanpath(web.repo, web.req.qsparams.get('file', '')) mf = ctx.manifest() node = ctx.node() @@ -495,7 +534,7 @@ dirs = {} parity = paritygen(web.stripecount) - if path and path[-1] != "/": + if path and path[-1:] != "/": path += "/" l = len(path) abspath = "/" + path @@ -542,7 +581,7 @@ emptydirs = [] h = dirs[d] while isinstance(h, dict) and len(h) == 1: - k, v = h.items()[0] + k, v = next(iter(h.items())) if v: emptydirs.append(k) h = v @@ -553,18 +592,19 @@ "emptydirs": "/".join(emptydirs), "basename": d} - return tmpl("manifest", - symrev=symrev, - path=abspath, - up=webutil.up(abspath), - upparity=next(parity), - fentries=filelist, - dentries=dirlist, - archives=web.archivelist(hex(node)), - **webutil.commonentry(web.repo, ctx)) + return web.sendtemplate( + 'manifest', + symrev=symrev, + path=abspath, + up=webutil.up(abspath), + upparity=next(parity), + fentries=filelist, + dentries=dirlist, + archives=web.archivelist(hex(node)), + **pycompat.strkwargs(webutil.commonentry(web.repo, ctx))) @webcommand('tags') -def tags(web, req, tmpl): +def tags(web): """ /tags ----- @@ -590,14 +630,15 @@ "date": web.repo[n].date(), "node": hex(n)} - return tmpl("tags", - node=hex(web.repo.changelog.tip()), - entries=lambda **x: entries(False, False, **x), - entriesnotip=lambda **x: entries(True, False, **x), - latestentry=lambda **x: entries(True, True, **x)) + return web.sendtemplate( + 'tags', + node=hex(web.repo.changelog.tip()), + entries=lambda **x: entries(False, False, **x), + entriesnotip=lambda **x: entries(True, False, **x), + latestentry=lambda **x: entries(True, True, **x)) @webcommand('bookmarks') 
-def bookmarks(web, req, tmpl): +def bookmarks(web): """ /bookmarks ---------- @@ -628,14 +669,15 @@ else: latestrev = -1 - return tmpl("bookmarks", - node=hex(web.repo.changelog.tip()), - lastchange=[{"date": web.repo[latestrev].date()}], - entries=lambda **x: entries(latestonly=False, **x), - latestentry=lambda **x: entries(latestonly=True, **x)) + return web.sendtemplate( + 'bookmarks', + node=hex(web.repo.changelog.tip()), + lastchange=[{'date': web.repo[latestrev].date()}], + entries=lambda **x: entries(latestonly=False, **x), + latestentry=lambda **x: entries(latestonly=True, **x)) @webcommand('branches') -def branches(web, req, tmpl): +def branches(web): """ /branches --------- @@ -650,11 +692,15 @@ """ entries = webutil.branchentries(web.repo, web.stripecount) latestentry = webutil.branchentries(web.repo, web.stripecount, 1) - return tmpl('branches', node=hex(web.repo.changelog.tip()), - entries=entries, latestentry=latestentry) + + return web.sendtemplate( + 'branches', + node=hex(web.repo.changelog.tip()), + entries=entries, + latestentry=latestentry) @webcommand('summary') -def summary(web, req, tmpl): +def summary(web): """ /summary -------- @@ -668,7 +714,7 @@ """ i = reversed(web.repo.tagslist()) - def tagentries(**map): + def tagentries(context): parity = paritygen(web.stripecount) count = 0 for k, n in i: @@ -679,11 +725,12 @@ if count > 10: # limit to 10 tags break - yield tmpl("tagentry", - parity=next(parity), - tag=k, - node=hex(n), - date=web.repo[n].date()) + yield { + 'parity': next(parity), + 'tag': k, + 'node': hex(n), + 'date': web.repo[n].date(), + } def bookmarks(**map): parity = paritygen(web.stripecount) @@ -696,7 +743,7 @@ 'date': web.repo[n].date(), 'node': hex(n)} - def changelist(**map): + def changelist(context): parity = paritygen(web.stripecount, offset=start - end) l = [] # build a list in forward order for efficiency revs = [] @@ -704,11 +751,9 @@ revs = web.repo.changelog.revs(start, end - 1) for i in revs: ctx = web.repo[i] 
- - l.append(tmpl( - 'shortlogentry', - parity=next(parity), - **webutil.commonentry(web.repo, ctx))) + lm = webutil.commonentry(web.repo, ctx) + lm['parity'] = next(parity) + l.append(lm) for entry in reversed(l): yield entry @@ -721,21 +766,25 @@ desc = web.config("web", "description") if not desc: desc = 'unknown' - return tmpl("summary", - desc=desc, - owner=get_contact(web.config) or "unknown", - lastchange=tip.date(), - tags=tagentries, - bookmarks=bookmarks, - branches=webutil.branchentries(web.repo, web.stripecount, 10), - shortlog=changelist, - node=tip.hex(), - symrev='tip', - archives=web.archivelist("tip"), - labels=web.configlist('web', 'labels')) + labels = web.configlist('web', 'labels') + + return web.sendtemplate( + 'summary', + desc=desc, + owner=get_contact(web.config) or 'unknown', + lastchange=tip.date(), + tags=templateutil.mappinggenerator(tagentries, name='tagentry'), + bookmarks=bookmarks, + branches=webutil.branchentries(web.repo, web.stripecount, 10), + shortlog=templateutil.mappinggenerator(changelist, + name='shortlogentry'), + node=tip.hex(), + symrev='tip', + archives=web.archivelist('tip'), + labels=templateutil.hybridlist(labels, name='label')) @webcommand('filediff') -def filediff(web, req, tmpl): +def filediff(web): """ /diff/{revision}/{path} ----------------------- @@ -749,10 +798,10 @@ """ fctx, ctx = None, None try: - fctx = webutil.filectx(web.repo, req) + fctx = webutil.filectx(web.repo, web.req) except LookupError: - ctx = webutil.changectx(web.repo, req) - path = webutil.cleanpath(web.repo, req.form['file'][0]) + ctx = webutil.changectx(web.repo, web.req) + path = webutil.cleanpath(web.repo, web.req.qsparams['file']) if path not in ctx.files(): raise @@ -762,27 +811,29 @@ basectx = ctx.p1() style = web.config('web', 'style') - if 'style' in req.form: - style = req.form['style'][0] + if 'style' in web.req.qsparams: + style = web.req.qsparams['style'] - diffs = webutil.diffs(web, tmpl, ctx, basectx, [path], style) + diffs = 
webutil.diffs(web, ctx, basectx, [path], style) if fctx is not None: rename = webutil.renamelink(fctx) ctx = fctx else: rename = [] ctx = ctx - return tmpl("filediff", - file=path, - symrev=webutil.symrevorshortnode(req, ctx), - rename=rename, - diff=diffs, - **webutil.commonentry(web.repo, ctx)) + + return web.sendtemplate( + 'filediff', + file=path, + symrev=webutil.symrevorshortnode(web.req, ctx), + rename=rename, + diff=diffs, + **pycompat.strkwargs(webutil.commonentry(web.repo, ctx))) diff = webcommand('diff')(filediff) @webcommand('comparison') -def comparison(web, req, tmpl): +def comparison(web): """ /comparison/{revision}/{path} ----------------------------- @@ -798,14 +849,14 @@ The ``filecomparison`` template is rendered. """ - ctx = webutil.changectx(web.repo, req) - if 'file' not in req.form: + ctx = webutil.changectx(web.repo, web.req) + if 'file' not in web.req.qsparams: raise ErrorResponse(HTTP_NOT_FOUND, 'file not given') - path = webutil.cleanpath(web.repo, req.form['file'][0]) + path = webutil.cleanpath(web.repo, web.req.qsparams['file']) parsecontext = lambda v: v == 'full' and -1 or int(v) - if 'context' in req.form: - context = parsecontext(req.form['context'][0]) + if 'context' in web.req.qsparams: + context = parsecontext(web.req.qsparams['context']) else: context = parsecontext(web.config('web', 'comparisoncontext', '5')) @@ -836,26 +887,28 @@ pfctx = ctx.parents()[0][path] leftlines = filelines(pfctx) - comparison = webutil.compare(tmpl, context, leftlines, rightlines) + comparison = webutil.compare(web.tmpl, context, leftlines, rightlines) if fctx is not None: rename = webutil.renamelink(fctx) ctx = fctx else: rename = [] ctx = ctx - return tmpl('filecomparison', - file=path, - symrev=webutil.symrevorshortnode(req, ctx), - rename=rename, - leftrev=leftrev, - leftnode=hex(leftnode), - rightrev=rightrev, - rightnode=hex(rightnode), - comparison=comparison, - **webutil.commonentry(web.repo, ctx)) + + return web.sendtemplate( + 
'filecomparison', + file=path, + symrev=webutil.symrevorshortnode(web.req, ctx), + rename=rename, + leftrev=leftrev, + leftnode=hex(leftnode), + rightrev=rightrev, + rightnode=hex(rightnode), + comparison=comparison, + **pycompat.strkwargs(webutil.commonentry(web.repo, ctx))) @webcommand('annotate') -def annotate(web, req, tmpl): +def annotate(web): """ /annotate/{revision}/{path} --------------------------- @@ -871,7 +924,7 @@ The ``fileannotate`` template is rendered. """ - fctx = webutil.filectx(web.repo, req) + fctx = webutil.filectx(web.repo, web.req) f = fctx.path() parity = paritygen(web.stripecount) ishead = fctx.filerev() in fctx.filelog().headrevs() @@ -899,13 +952,14 @@ if fctx.isbinary(): mt = (mimetypes.guess_type(fctx.path())[0] or 'application/octet-stream') - lines = [((fctx.filectx(fctx.filerev()), 1), '(binary:%s)' % mt)] + lines = [dagop.annotateline(fctx=fctx.filectx(fctx.filerev()), + lineno=1, text='(binary:%s)' % mt)] else: - lines = webutil.annotate(req, fctx, web.repo.ui) + lines = webutil.annotate(web.req, fctx, web.repo.ui) previousrev = None blockparitygen = paritygen(1) - for lineno, (aline, l) in enumerate(lines): + for lineno, aline in enumerate(lines): f = aline.fctx rev = f.rev() if rev != previousrev: @@ -925,28 +979,29 @@ "blockhead": blockhead, "blockparity": blockparity, "targetline": aline.lineno, - "line": l, + "line": aline.text, "lineno": lineno + 1, "lineid": "l%d" % (lineno + 1), "linenumber": "% 6d" % (lineno + 1), "revdate": f.date()} - diffopts = webutil.difffeatureopts(req, web.repo.ui, 'annotate') + diffopts = webutil.difffeatureopts(web.req, web.repo.ui, 'annotate') diffopts = {k: getattr(diffopts, k) for k in diffopts.defaults} - return tmpl("fileannotate", - file=f, - annotate=annotate, - path=webutil.up(f), - symrev=webutil.symrevorshortnode(req, fctx), - rename=webutil.renamelink(fctx), - permissions=fctx.manifest().flags(f), - ishead=int(ishead), - diffopts=diffopts, - **webutil.commonentry(web.repo, fctx)) + 
return web.sendtemplate( + 'fileannotate', + file=f, + annotate=annotate, + path=webutil.up(f), + symrev=webutil.symrevorshortnode(web.req, fctx), + rename=webutil.renamelink(fctx), + permissions=fctx.manifest().flags(f), + ishead=int(ishead), + diffopts=diffopts, + **pycompat.strkwargs(webutil.commonentry(web.repo, fctx))) @webcommand('filelog') -def filelog(web, req, tmpl): +def filelog(web): """ /filelog/{revision}/{path} -------------------------- @@ -960,16 +1015,16 @@ """ try: - fctx = webutil.filectx(web.repo, req) + fctx = webutil.filectx(web.repo, web.req) f = fctx.path() fl = fctx.filelog() except error.LookupError: - f = webutil.cleanpath(web.repo, req.form['file'][0]) + f = webutil.cleanpath(web.repo, web.req.qsparams['file']) fl = web.repo.file(f) numrevs = len(fl) if not numrevs: # file doesn't exist at all raise - rev = webutil.changectx(web.repo, req).rev() + rev = webutil.changectx(web.repo, web.req).rev() first = fl.linkrev(0) if rev < first: # current rev is from before file existed raise @@ -979,27 +1034,27 @@ fctx = web.repo.filectx(f, fl.linkrev(frev)) revcount = web.maxshortchanges - if 'revcount' in req.form: + if 'revcount' in web.req.qsparams: try: - revcount = int(req.form.get('revcount', [revcount])[0]) + revcount = int(web.req.qsparams.get('revcount', revcount)) revcount = max(revcount, 1) - tmpl.defaults['sessionvars']['revcount'] = revcount + web.tmpl.defaults['sessionvars']['revcount'] = revcount except ValueError: pass - lrange = webutil.linerange(req) + lrange = webutil.linerange(web.req) - lessvars = copy.copy(tmpl.defaults['sessionvars']) - lessvars['revcount'] = max(revcount / 2, 1) - morevars = copy.copy(tmpl.defaults['sessionvars']) + lessvars = copy.copy(web.tmpl.defaults['sessionvars']) + lessvars['revcount'] = max(revcount // 2, 1) + morevars = copy.copy(web.tmpl.defaults['sessionvars']) morevars['revcount'] = revcount * 2 - patch = 'patch' in req.form + patch = 'patch' in web.req.qsparams if patch: - lessvars['patch'] = 
morevars['patch'] = req.form['patch'][0] - descend = 'descend' in req.form + lessvars['patch'] = morevars['patch'] = web.req.qsparams['patch'] + descend = 'descend' in web.req.qsparams if descend: - lessvars['descend'] = morevars['descend'] = req.form['descend'][0] + lessvars['descend'] = morevars['descend'] = web.req.qsparams['descend'] count = fctx.filerev() + 1 start = max(0, count - revcount) # first rev on this page @@ -1007,18 +1062,20 @@ parity = paritygen(web.stripecount, offset=start - end) repo = web.repo - revs = fctx.filelog().revs(start, end - 1) + filelog = fctx.filelog() + revs = [filerev for filerev in filelog.revs(start, end - 1) + if filelog.linkrev(filerev) in repo] entries = [] diffstyle = web.config('web', 'style') - if 'style' in req.form: - diffstyle = req.form['style'][0] + if 'style' in web.req.qsparams: + diffstyle = web.req.qsparams['style'] def diff(fctx, linerange=None): ctx = fctx.changectx() basectx = ctx.p1() path = fctx.path() - return webutil.diffs(web, tmpl, ctx, basectx, [path], diffstyle, + return webutil.diffs(web, ctx, basectx, [path], diffstyle, linerange=linerange, lineidprefix='%s-' % ctx.hex()[:12]) @@ -1027,7 +1084,7 @@ linerange = webutil.formatlinerange(*lrange) # deactivate numeric nav links when linerange is specified as this # would required a dedicated "revnav" class - nav = None + nav = templateutil.mappinglist([]) if descend: it = dagop.blockdescendants(fctx, *lrange) else: @@ -1044,7 +1101,7 @@ file=path, diff=diffs, linerange=webutil.formatlinerange(*lr), - **webutil.commonentry(repo, c))) + **pycompat.strkwargs(webutil.commonentry(repo, c)))) if i == revcount: break lessvars['linerange'] = webutil.formatlinerange(*lrange) @@ -1061,29 +1118,30 @@ file=f, diff=diffs, rename=webutil.renamelink(iterfctx), - **webutil.commonentry(repo, iterfctx))) + **pycompat.strkwargs(webutil.commonentry(repo, iterfctx)))) entries.reverse() revnav = webutil.filerevnav(web.repo, fctx.path()) nav = revnav.gen(end - 1, revcount, 
count) latestentry = entries[:1] - return tmpl("filelog", - file=f, - nav=nav, - symrev=webutil.symrevorshortnode(req, fctx), - entries=entries, - descend=descend, - patch=patch, - latestentry=latestentry, - linerange=linerange, - revcount=revcount, - morevars=morevars, - lessvars=lessvars, - **webutil.commonentry(web.repo, fctx)) + return web.sendtemplate( + 'filelog', + file=f, + nav=nav, + symrev=webutil.symrevorshortnode(web.req, fctx), + entries=entries, + descend=descend, + patch=patch, + latestentry=latestentry, + linerange=linerange, + revcount=revcount, + morevars=morevars, + lessvars=lessvars, + **pycompat.strkwargs(webutil.commonentry(web.repo, fctx))) @webcommand('archive') -def archive(web, req, tmpl): +def archive(web): """ /archive/{revision}.{format}[/{path}] ------------------------------------- @@ -1103,11 +1161,11 @@ No template is used for this handler. Raw, binary content is generated. """ - type_ = req.form.get('type', [None])[0] + type_ = web.req.qsparams.get('type') allowed = web.configlist("web", "allow_archive") - key = req.form['node'][0] + key = web.req.qsparams['node'] - if type_ not in web.archivespecs: + if type_ not in webutil.archivespecs: msg = 'Unsupported archive type: %s' % type_ raise ErrorResponse(HTTP_NOT_FOUND, msg) @@ -1116,44 +1174,51 @@ msg = 'Archive type not allowed: %s' % type_ raise ErrorResponse(HTTP_FORBIDDEN, msg) - reponame = re.sub(r"\W+", "-", os.path.basename(web.reponame)) + reponame = re.sub(br"\W+", "-", os.path.basename(web.reponame)) cnode = web.repo.lookup(key) arch_version = key if cnode == key or key == 'tip': arch_version = short(cnode) name = "%s-%s" % (reponame, arch_version) - ctx = webutil.changectx(web.repo, req) + ctx = webutil.changectx(web.repo, web.req) pats = [] match = scmutil.match(ctx, []) - file = req.form.get('file', None) + file = web.req.qsparams.get('file') if file: - pats = ['path:' + file[0]] + pats = ['path:' + file] match = scmutil.match(ctx, pats, default='path') if pats: files = 
[f for f in ctx.manifest().keys() if match(f)] if not files: raise ErrorResponse(HTTP_NOT_FOUND, - 'file(s) not found: %s' % file[0]) + 'file(s) not found: %s' % file) + + mimetype, artype, extension, encoding = webutil.archivespecs[type_] + + web.res.headers['Content-Type'] = mimetype + web.res.headers['Content-Disposition'] = 'attachment; filename=%s%s' % ( + name, extension) - mimetype, artype, extension, encoding = web.archivespecs[type_] - headers = [ - ('Content-Disposition', 'attachment; filename=%s%s' % (name, extension)) - ] if encoding: - headers.append(('Content-Encoding', encoding)) - req.headers.extend(headers) - req.respond(HTTP_OK, mimetype) + web.res.headers['Content-Encoding'] = encoding - archival.archive(web.repo, req, cnode, artype, prefix=name, + web.res.setbodywillwrite() + if list(web.res.sendresponse()): + raise error.ProgrammingError('sendresponse() should not emit data ' + 'if writing later') + + bodyfh = web.res.getbodyfile() + + archival.archive(web.repo, bodyfh, cnode, artype, prefix=name, matchfn=match, subrepos=web.configbool("web", "archivesubrepos")) + return [] - @webcommand('static') -def static(web, req, tmpl): - fname = req.form['file'][0] +def static(web): + fname = web.req.qsparams['file'] # a repo owner may set web.static in .hg/hgrc to get any file # readable by the user running the CGI script static = web.config("web", "static", None, untrusted=False) @@ -1162,11 +1227,12 @@ if isinstance(tp, str): tp = [tp] static = [os.path.join(p, 'static') for p in tp] - staticfile(static, fname, req) - return [] + + staticfile(static, fname, web.res) + return web.res.sendresponse() @webcommand('graph') -def graph(web, req, tmpl): +def graph(web): """ /graph[/{revision}] ------------------- @@ -1189,9 +1255,9 @@ This handler will render the ``graph`` template. 
""" - if 'node' in req.form: - ctx = webutil.changectx(web.repo, req) - symrev = webutil.symrevorshortnode(req, ctx) + if 'node' in web.req.qsparams: + ctx = webutil.changectx(web.repo, web.req) + symrev = webutil.symrevorshortnode(web.req, ctx) else: ctx = web.repo['tip'] symrev = 'tip' @@ -1199,21 +1265,21 @@ bg_height = 39 revcount = web.maxshortchanges - if 'revcount' in req.form: + if 'revcount' in web.req.qsparams: try: - revcount = int(req.form.get('revcount', [revcount])[0]) + revcount = int(web.req.qsparams.get('revcount', revcount)) revcount = max(revcount, 1) - tmpl.defaults['sessionvars']['revcount'] = revcount + web.tmpl.defaults['sessionvars']['revcount'] = revcount except ValueError: pass - lessvars = copy.copy(tmpl.defaults['sessionvars']) - lessvars['revcount'] = max(revcount / 2, 1) - morevars = copy.copy(tmpl.defaults['sessionvars']) + lessvars = copy.copy(web.tmpl.defaults['sessionvars']) + lessvars['revcount'] = max(revcount // 2, 1) + morevars = copy.copy(web.tmpl.defaults['sessionvars']) morevars['revcount'] = revcount * 2 - graphtop = req.form.get('graphtop', [ctx.hex()])[0] - graphvars = copy.copy(tmpl.defaults['sessionvars']) + graphtop = web.req.qsparams.get('graphtop', ctx.hex()) + graphvars = copy.copy(web.tmpl.defaults['sessionvars']) graphvars['graphtop'] = graphtop count = len(web.repo) @@ -1305,17 +1371,24 @@ rows = len(tree) - return tmpl('graph', rev=rev, symrev=symrev, revcount=revcount, - uprev=uprev, - lessvars=lessvars, morevars=morevars, downrev=downrev, - graphvars=graphvars, - rows=rows, - bg_height=bg_height, - changesets=count, - nextentry=nextentry, - jsdata=lambda **x: jsdata(), - nodes=lambda **x: nodes(), - node=ctx.hex(), changenav=changenav) + return web.sendtemplate( + 'graph', + rev=rev, + symrev=symrev, + revcount=revcount, + uprev=uprev, + lessvars=lessvars, + morevars=morevars, + downrev=downrev, + graphvars=graphvars, + rows=rows, + bg_height=bg_height, + changesets=count, + nextentry=nextentry, + 
jsdata=lambda **x: jsdata(), + nodes=lambda **x: nodes(), + node=ctx.hex(), + changenav=changenav) def _getdoc(e): doc = e[0].__doc__ @@ -1326,7 +1399,7 @@ return doc @webcommand('help') -def help(web, req, tmpl): +def help(web): """ /help[/{topic}] --------------- @@ -1342,7 +1415,7 @@ """ from .. import commands, help as helpmod # avoid cycle - topicname = req.form.get('node', [None])[0] + topicname = web.req.qsparams.get('node') if not topicname: def topics(**map): for entries, summary, _doc in helpmod.helptable: @@ -1371,8 +1444,12 @@ for c, doc in other: yield {'topic': c, 'summary': doc} - return tmpl('helptopics', topics=topics, earlycommands=earlycommands, - othercommands=othercommands, title='Index') + return web.sendtemplate( + 'helptopics', + topics=topics, + earlycommands=earlycommands, + othercommands=othercommands, + title='Index') # Render an index of sub-topics. if topicname in helpmod.subtopics: @@ -1384,8 +1461,11 @@ 'summary': summary, }) - return tmpl('helptopics', topics=topics, title=topicname, - subindex=True) + return web.sendtemplate( + 'helptopics', + topics=topics, + title=topicname, + subindex=True) u = webutil.wsgiui.load() u.verbose = True @@ -1403,9 +1483,13 @@ try: doc = helpmod.help_(u, commands, topic, subtopic=subtopic) - except error.UnknownCommand: + except error.Abort: raise ErrorResponse(HTTP_NOT_FOUND) - return tmpl('help', topic=topicname, doc=doc) + + return web.sendtemplate( + 'help', + topic=topicname, + doc=doc) # tell hggettext to extract docstrings from these functions: i18nfunctions = commands.values() diff -r fb92df8b634c -r ed5448edcbfa mercurial/hgweb/webutil.py --- a/mercurial/hgweb/webutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hgweb/webutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -28,19 +28,48 @@ error, match, mdiff, + obsutil, patch, pathutil, pycompat, + scmutil, templatefilters, templatekw, + templateutil, ui as uimod, util, ) +from ..utils import ( + stringutil, +) + +archivespecs = util.sortdict(( + 
('zip', ('application/zip', 'zip', '.zip', None)), + ('gz', ('application/x-gzip', 'tgz', '.tar.gz', None)), + ('bz2', ('application/x-bzip2', 'tbz2', '.tar.bz2', None)), +)) + +def archivelist(ui, nodeid, url=None): + allowed = ui.configlist('web', 'allow_archive', untrusted=True) + archives = [] + + for typ, spec in archivespecs.iteritems(): + if typ in allowed or ui.configbool('web', 'allow' + typ, + untrusted=True): + archives.append({ + 'type': typ, + 'extension': spec[2], + 'node': nodeid, + 'url': url, + }) + + return templateutil.mappinglist(archives) + def up(p): - if p[0] != "/": + if p[0:1] != "/": p = "/" + p - if p[-1] == "/": + if p[-1:] == "/": p = p[:-1] up = os.path.dirname(p) if up == "/": @@ -96,14 +125,16 @@ :limit: how far shall we link The return is: - - a single element tuple + - a single element mappinglist - containing a dictionary with a `before` and `after` key - - values are generator functions taking arbitrary number of kwargs - - yield items are dictionaries with `label` and `node` keys + - values are dictionaries with `label` and `node` keys """ if not self: # empty repo - return ({'before': (), 'after': ()},) + return templateutil.mappinglist([ + {'before': templateutil.mappinglist([]), + 'after': templateutil.mappinglist([])}, + ]) targets = [] for f in _navseq(1, pagelen): @@ -114,22 +145,25 @@ targets.sort() first = self._first() - navbefore = [("(%i)" % first, self.hex(first))] + navbefore = [{'label': '(%i)' % first, 'node': self.hex(first)}] navafter = [] for rev in targets: if rev not in self._revlog: continue if pos < rev < limit: - navafter.append(("+%d" % abs(rev - pos), self.hex(rev))) + navafter.append({'label': '+%d' % abs(rev - pos), + 'node': self.hex(rev)}) if 0 < rev < pos: - navbefore.append(("-%d" % abs(rev - pos), self.hex(rev))) - + navbefore.append({'label': '-%d' % abs(rev - pos), + 'node': self.hex(rev)}) - navafter.append(("tip", "tip")) + navafter.append({'label': 'tip', 'node': 'tip'}) - data = lambda i: 
{"label": i[0], "node": i[1]} - return ({'before': lambda **map: (data(i) for i in navbefore), - 'after': lambda **map: (data(i) for i in navafter)},) + # TODO: maybe this can be a scalar object supporting tomap() + return templateutil.mappinglist([ + {'before': templateutil.mappinglist(navbefore), + 'after': templateutil.mappinglist(navafter)}, + ]) class filerevnav(revnav): @@ -147,46 +181,45 @@ def hex(self, rev): return hex(self._changelog.node(self._revlog.linkrev(rev))) -class _siblings(object): - def __init__(self, siblings=None, hiderev=None): - if siblings is None: - siblings = [] - self.siblings = [s for s in siblings if s.node() != nullid] - if len(self.siblings) == 1 and self.siblings[0].rev() == hiderev: - self.siblings = [] +# TODO: maybe this can be a wrapper class for changectx/filectx list, which +# yields {'ctx': ctx} +def _ctxsgen(context, ctxs): + for s in ctxs: + d = { + 'node': s.hex(), + 'rev': s.rev(), + 'user': s.user(), + 'date': s.date(), + 'description': s.description(), + 'branch': s.branch(), + } + if util.safehasattr(s, 'path'): + d['file'] = s.path() + yield d - def __iter__(self): - for s in self.siblings: - d = { - 'node': s.hex(), - 'rev': s.rev(), - 'user': s.user(), - 'date': s.date(), - 'description': s.description(), - 'branch': s.branch(), - } - if util.safehasattr(s, 'path'): - d['file'] = s.path() - yield d - - def __len__(self): - return len(self.siblings) +def _siblings(siblings=None, hiderev=None): + if siblings is None: + siblings = [] + siblings = [s for s in siblings if s.node() != nullid] + if len(siblings) == 1 and siblings[0].rev() == hiderev: + siblings = [] + return templateutil.mappinggenerator(_ctxsgen, args=(siblings,)) def difffeatureopts(req, ui, section): diffopts = patch.difffeatureopts(ui, untrusted=True, section=section, whitespace=True) for k in ('ignorews', 'ignorewsamount', 'ignorewseol', 'ignoreblanklines'): - v = req.form.get(k, [None])[0] + v = req.qsparams.get(k) if v is not None: - v = 
util.parsebool(v) + v = stringutil.parsebool(v) setattr(diffopts, k, v if v is not None else True) return diffopts def annotate(req, fctx, ui): diffopts = difffeatureopts(req, ui, 'annotate') - return fctx.annotate(follow=True, linenumber=True, diffopts=diffopts) + return fctx.annotate(follow=True, diffopts=diffopts) def parents(ctx, hide=None): if isinstance(ctx, context.basefilectx): @@ -242,12 +275,18 @@ return branches def showtag(repo, tmpl, t1, node=nullid, **args): + args = pycompat.byteskwargs(args) for t in repo.nodetags(node): - yield tmpl(t1, tag=t, **args) + lm = args.copy() + lm['tag'] = t + yield tmpl.generate(t1, lm) def showbookmark(repo, tmpl, t1, node=nullid, **args): + args = pycompat.byteskwargs(args) for t in repo.nodebookmarks(node): - yield tmpl(t1, bookmark=t, **args) + lm = args.copy() + lm['bookmark'] = t + yield tmpl.generate(t1, lm) def branchentries(repo, stripecount, limit=0): tips = [] @@ -284,57 +323,46 @@ path = path.lstrip('/') return pathutil.canonpath(repo.root, '', path) -def changeidctx(repo, changeid): - try: - ctx = repo[changeid] - except error.RepoError: - man = repo.manifestlog._revlog - ctx = repo[man.linkrev(man.rev(man.lookup(changeid)))] - - return ctx - def changectx(repo, req): changeid = "tip" - if 'node' in req.form: - changeid = req.form['node'][0] + if 'node' in req.qsparams: + changeid = req.qsparams['node'] ipos = changeid.find(':') if ipos != -1: changeid = changeid[(ipos + 1):] - elif 'manifest' in req.form: - changeid = req.form['manifest'][0] - return changeidctx(repo, changeid) + return scmutil.revsymbol(repo, changeid) def basechangectx(repo, req): - if 'node' in req.form: - changeid = req.form['node'][0] + if 'node' in req.qsparams: + changeid = req.qsparams['node'] ipos = changeid.find(':') if ipos != -1: changeid = changeid[:ipos] - return changeidctx(repo, changeid) + return scmutil.revsymbol(repo, changeid) return None def filectx(repo, req): - if 'file' not in req.form: + if 'file' not in 
req.qsparams: raise ErrorResponse(HTTP_NOT_FOUND, 'file not given') - path = cleanpath(repo, req.form['file'][0]) - if 'node' in req.form: - changeid = req.form['node'][0] - elif 'filenode' in req.form: - changeid = req.form['filenode'][0] + path = cleanpath(repo, req.qsparams['file']) + if 'node' in req.qsparams: + changeid = req.qsparams['node'] + elif 'filenode' in req.qsparams: + changeid = req.qsparams['filenode'] else: raise ErrorResponse(HTTP_NOT_FOUND, 'node or filenode not given') try: - fctx = repo[changeid][path] + fctx = scmutil.revsymbol(repo, changeid)[path] except error.RepoError: fctx = repo.filectx(path, fileid=changeid) return fctx def linerange(req): - linerange = req.form.get('linerange') - if linerange is None: + linerange = req.qsparams.getall('linerange') + if not linerange: return None if len(linerange) > 1: raise ErrorResponse(HTTP_BAD_REQUEST, @@ -347,20 +375,41 @@ try: return util.processlinerange(fromline, toline) except error.ParseError as exc: - raise ErrorResponse(HTTP_BAD_REQUEST, str(exc)) + raise ErrorResponse(HTTP_BAD_REQUEST, pycompat.bytestr(exc)) def formatlinerange(fromline, toline): return '%d:%d' % (fromline + 1, toline) -def succsandmarkers(repo, ctx): - for item in templatekw.showsuccsandmarkers(repo, ctx): +def succsandmarkers(context, mapping): + repo = context.resource(mapping, 'repo') + itemmappings = templatekw.showsuccsandmarkers(context, mapping) + for item in itemmappings.tovalue(context, mapping): item['successors'] = _siblings(repo[successor] for successor in item['successors']) yield item +# teach templater succsandmarkers is switched to (context, mapping) API +succsandmarkers._requires = {'repo', 'ctx'} + +def whyunstable(context, mapping): + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + + entries = obsutil.whyunstable(repo, ctx) + for entry in entries: + if entry.get('divergentnodes'): + entry['divergentnodes'] = _siblings(entry['divergentnodes']) + yield entry + 
+whyunstable._requires = {'repo', 'ctx'} + def commonentry(repo, ctx): node = ctx.node() return { + # TODO: perhaps ctx.changectx() should be assigned if ctx is a + # filectx, but I'm not pretty sure if that would always work because + # fctx.parents() != fctx.changectx.parents() for example. + 'ctx': ctx, 'rev': ctx.rev(), 'node': hex(node), 'author': ctx.user(), @@ -369,8 +418,9 @@ 'extra': ctx.extra(), 'phase': ctx.phasestr(), 'obsolete': ctx.obsolete(), - 'succsandmarkers': lambda **x: succsandmarkers(repo, ctx), + 'succsandmarkers': succsandmarkers, 'instabilities': [{"instability": i} for i in ctx.instabilities()], + 'whyunstable': whyunstable, 'branch': nodebranchnodefault(ctx), 'inbranch': nodeinbranch(repo, ctx), 'branches': nodebranchdict(repo, ctx), @@ -380,7 +430,7 @@ 'child': lambda **x: children(ctx), } -def changelistentry(web, ctx, tmpl): +def changelistentry(web, ctx): '''Obtain a dictionary to be used for entries in a changelist. This function is called when producing items for the "entries" list passed @@ -389,8 +439,8 @@ repo = web.repo rev = ctx.rev() n = ctx.node() - showtags = showtag(repo, tmpl, 'changelogtag', n) - files = listfilediffs(tmpl, ctx.files(), n, web.maxfiles) + showtags = showtag(repo, web.tmpl, 'changelogtag', n) + files = listfilediffs(web.tmpl, ctx.files(), n, web.maxfiles) entry = commonentry(repo, ctx) entry.update( @@ -403,16 +453,16 @@ return entry def symrevorshortnode(req, ctx): - if 'node' in req.form: - return templatefilters.revescape(req.form['node'][0]) + if 'node' in req.qsparams: + return templatefilters.revescape(req.qsparams['node']) else: return short(ctx.node()) -def changesetentry(web, req, tmpl, ctx): +def changesetentry(web, ctx): '''Obtain a dictionary to be used to render the "changeset" template.''' - showtags = showtag(web.repo, tmpl, 'changesettag', ctx.node()) - showbookmarks = showbookmark(web.repo, tmpl, 'changesetbookmark', + showtags = showtag(web.repo, web.tmpl, 'changesettag', ctx.node()) + 
showbookmarks = showbookmark(web.repo, web.tmpl, 'changesetbookmark', ctx.node()) showbranch = nodebranchnodefault(ctx) @@ -420,27 +470,30 @@ parity = paritygen(web.stripecount) for blockno, f in enumerate(ctx.files()): template = 'filenodelink' if f in ctx else 'filenolink' - files.append(tmpl(template, - node=ctx.hex(), file=f, blockno=blockno + 1, - parity=next(parity))) + files.append(web.tmpl.generate(template, { + 'node': ctx.hex(), + 'file': f, + 'blockno': blockno + 1, + 'parity': next(parity), + })) - basectx = basechangectx(web.repo, req) + basectx = basechangectx(web.repo, web.req) if basectx is None: basectx = ctx.p1() style = web.config('web', 'style') - if 'style' in req.form: - style = req.form['style'][0] + if 'style' in web.req.qsparams: + style = web.req.qsparams['style'] - diff = diffs(web, tmpl, ctx, basectx, None, style) + diff = diffs(web, ctx, basectx, None, style) parity = paritygen(web.stripecount) diffstatsgen = diffstatgen(ctx, basectx) - diffstats = diffstat(tmpl, ctx, diffstatsgen, parity) + diffstats = diffstat(web.tmpl, ctx, diffstatsgen, parity) return dict( diff=diff, - symrev=symrevorshortnode(req, ctx), + symrev=symrevorshortnode(web.req, ctx), basenode=basectx.hex(), changesettag=showtags, changesetbookmark=showbookmarks, @@ -449,15 +502,15 @@ diffsummary=lambda **x: diffsummary(diffstatsgen), diffstat=diffstats, archives=web.archivelist(ctx.hex()), - **commonentry(web.repo, ctx)) + **pycompat.strkwargs(commonentry(web.repo, ctx))) def listfilediffs(tmpl, files, node, max): for f in files[:max]: - yield tmpl('filedifflink', node=hex(node), file=f) + yield tmpl.generate('filedifflink', {'node': hex(node), 'file': f}) if len(files) > max: - yield tmpl('fileellipses') + yield tmpl.generate('fileellipses', {}) -def diffs(web, tmpl, ctx, basectx, files, style, linerange=None, +def diffs(web, ctx, basectx, files, style, linerange=None, lineidprefix=''): def prettyprintlines(lines, blockno): @@ -471,11 +524,12 @@ ltype = "difflineat" 
else: ltype = "diffline" - yield tmpl(ltype, - line=l, - lineno=lineno, - lineid=lineidprefix + "l%s" % difflineno, - linenumber="% 8s" % difflineno) + yield web.tmpl.generate(ltype, { + 'line': l, + 'lineno': lineno, + 'lineid': lineidprefix + "l%s" % difflineno, + 'linenumber': "% 8s" % difflineno, + }) repo = web.repo if files: @@ -500,24 +554,30 @@ continue lines.extend(hunklines) if lines: - yield tmpl('diffblock', parity=next(parity), blockno=blockno, - lines=prettyprintlines(lines, blockno)) + yield web.tmpl.generate('diffblock', { + 'parity': next(parity), + 'blockno': blockno, + 'lines': prettyprintlines(lines, blockno), + }) def compare(tmpl, context, leftlines, rightlines): '''Generator function that provides side-by-side comparison data.''' def compline(type, leftlineno, leftline, rightlineno, rightline): - lineid = leftlineno and ("l%s" % leftlineno) or '' - lineid += rightlineno and ("r%s" % rightlineno) or '' - return tmpl('comparisonline', - type=type, - lineid=lineid, - leftlineno=leftlineno, - leftlinenumber="% 6s" % (leftlineno or ''), - leftline=leftline or '', - rightlineno=rightlineno, - rightlinenumber="% 6s" % (rightlineno or ''), - rightline=rightline or '') + lineid = leftlineno and ("l%d" % leftlineno) or '' + lineid += rightlineno and ("r%d" % rightlineno) or '' + llno = '%d' % leftlineno if leftlineno else '' + rlno = '%d' % rightlineno if rightlineno else '' + return tmpl.generate('comparisonline', { + 'type': type, + 'lineid': lineid, + 'leftlineno': leftlineno, + 'leftlinenumber': "% 6s" % llno, + 'leftline': leftline or '', + 'rightlineno': rightlineno, + 'rightlinenumber': "% 6s" % rlno, + 'rightline': rightline or '', + }) def getblock(opcodes): for type, llo, lhi, rlo, rhi in opcodes: @@ -547,10 +607,11 @@ s = difflib.SequenceMatcher(None, leftlines, rightlines) if context < 0: - yield tmpl('comparisonblock', lines=getblock(s.get_opcodes())) + yield tmpl.generate('comparisonblock', + {'lines': getblock(s.get_opcodes())}) else: 
for oc in s.get_grouped_opcodes(n=context): - yield tmpl('comparisonblock', lines=getblock(oc)) + yield tmpl.generate('comparisonblock', {'lines': getblock(oc)}) def diffstatgen(ctx, basectx): '''Generator function that provides the diffstat data.''' @@ -584,29 +645,49 @@ template = 'diffstatlink' if filename in files else 'diffstatnolink' total = adds + removes fileno += 1 - yield tmpl(template, node=ctx.hex(), file=filename, fileno=fileno, - total=total, addpct=pct(adds), removepct=pct(removes), - parity=next(parity)) + yield tmpl.generate(template, { + 'node': ctx.hex(), + 'file': filename, + 'fileno': fileno, + 'total': total, + 'addpct': pct(adds), + 'removepct': pct(removes), + 'parity': next(parity), + }) -class sessionvars(object): +class sessionvars(templateutil.wrapped): def __init__(self, vars, start='?'): - self.start = start - self.vars = vars + self._start = start + self._vars = vars + def __getitem__(self, key): - return self.vars[key] + return self._vars[key] + def __setitem__(self, key, value): - self.vars[key] = value + self._vars[key] = value + def __copy__(self): - return sessionvars(copy.copy(self.vars), self.start) - def __iter__(self): - separator = self.start - for key, value in sorted(self.vars.iteritems()): + return sessionvars(copy.copy(self._vars), self._start) + + def itermaps(self, context): + separator = self._start + for key, value in sorted(self._vars.iteritems()): yield {'name': key, 'value': pycompat.bytestr(value), 'separator': separator, } separator = '&' + def join(self, context, mapping, sep): + # could be '{separator}{name}={value|urlescape}' + raise error.ParseError(_('not displayable without template')) + + def show(self, context, mapping): + return self.join(context, '') + + def tovalue(self, context, mapping): + return self._vars + class wsgiui(uimod.ui): # default termwidth breaks under mod_wsgi def termwidth(self): @@ -619,14 +700,14 @@ websubdefs += repo.ui.configitems('interhg') for key, pattern in websubdefs: # grab 
the delimiter from the character after the "s" - unesc = pattern[1] + unesc = pattern[1:2] delim = re.escape(unesc) # identify portions of the pattern, taking care to avoid escaped # delimiters. the replace format and flags are optional, but # delimiters are required. match = re.match( - r'^s%s(.+)(?:(?<=\\\\)|(?@,;:\\"/\[\]\?=]') + +def _formatparam(param, value=None, quote=1): + """Convenience function to format and return a key=value pair. + This will quote the value if needed or if quote is true. + """ + if value is not None and len(value) > 0: + if quote or tspecials.search(value): + value = value.replace('\\', '\\\\').replace('"', r'\"') + return '%s="%s"' % (param, value) + else: + return '%s=%s' % (param, value) + else: + return param + + +class Headers(object): + """Manage a collection of HTTP response headers""" + + def __init__(self, headers=None): + headers = headers if headers is not None else [] + if type(headers) is not list: + raise TypeError("Headers must be a list of name/value tuples") + self._headers = headers + if __debug__: + for k, v in headers: + self._convert_string_type(k) + self._convert_string_type(v) + + def _convert_string_type(self, value): + """Convert/check value type.""" + if type(value) is bytes: + return value + raise AssertionError(u"Header names/values must be" + u" of type bytes (got %s)" % repr(value)) + + def __len__(self): + """Return the total number of headers, including duplicates.""" + return len(self._headers) + + def __setitem__(self, name, val): + """Set the value of a header.""" + del self[name] + self._headers.append( + (self._convert_string_type(name), self._convert_string_type(val))) + + def __delitem__(self, name): + """Delete all occurrences of a header, if present. + Does *not* raise an exception if the header is missing. 
+ """ + name = self._convert_string_type(name.lower()) + self._headers[:] = [kv for kv in self._headers if kv[0].lower() != name] + + def __getitem__(self, name): + """Get the first header value for 'name' + Return None if the header is missing instead of raising an exception. + Note that if the header appeared multiple times, the first exactly which + occurrence gets returned is undefined. Use getall() to get all + the values matching a header field name. + """ + return self.get(name) + + def __contains__(self, name): + """Return true if the message contains the header.""" + return self.get(name) is not None + + + def get_all(self, name): + """Return a list of all the values for the named field. + These will be sorted in the order they appeared in the original header + list or were added to this instance, and may contain duplicates. Any + fields deleted and re-inserted are always appended to the header list. + If no fields exist with the given name, returns an empty list. + """ + name = self._convert_string_type(name.lower()) + return [kv[1] for kv in self._headers if kv[0].lower()==name] + + + def get(self, name, default=None): + """Get the first header value for 'name', or return 'default'""" + name = self._convert_string_type(name.lower()) + for k, v in self._headers: + if k.lower()==name: + return v + return default + + + def keys(self): + """Return a list of all the header field names. + These will be sorted in the order they appeared in the original header + list, or were added to this instance, and may contain duplicates. + Any fields deleted and re-inserted are always appended to the header + list. + """ + return [k for k, v in self._headers] + + def values(self): + """Return a list of all header values. + These will be sorted in the order they appeared in the original header + list, or were added to this instance, and may contain duplicates. + Any fields deleted and re-inserted are always appended to the header + list. 
+ """ + return [v for k, v in self._headers] + + def items(self): + """Get all the header fields and values. + These will be sorted in the order they were in the original header + list, or were added to this instance, and may contain duplicates. + Any fields deleted and re-inserted are always appended to the header + list. + """ + return self._headers[:] + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self._headers) + + def __str__(self): + """str() returns the formatted headers, complete with end line, + suitable for direct HTTP transmission.""" + return '\r\n'.join(["%s: %s" % kv for kv in self._headers]+['','']) + + def __bytes__(self): + return str(self).encode('iso-8859-1') + + def setdefault(self, name, value): + """Return first matching header value for 'name', or 'value' + If there is no header named 'name', add a new header with name 'name' + and value 'value'.""" + result = self.get(name) + if result is None: + self._headers.append((self._convert_string_type(name), + self._convert_string_type(value))) + return value + else: + return result + + def add_header(self, _name, _value, **_params): + """Extended header setting. + _name is the header field to add. keyword arguments can be used to set + additional parameters for the header field, with underscores converted + to dashes. Normally the parameter will be added as key="value" unless + value is None, in which case only the key will be added. + Example: + h.add_header('content-disposition', 'attachment', filename='bud.gif') + Note that unlike the corresponding 'email.message' method, this does + *not* handle '(charset, language, value)' tuples: all values must be + strings or None. 
+ """ + parts = [] + if _value is not None: + _value = self._convert_string_type(_value) + parts.append(_value) + for k, v in _params.items(): + k = self._convert_string_type(k) + if v is None: + parts.append(k.replace('_', '-')) + else: + v = self._convert_string_type(v) + parts.append(_formatparam(k.replace('_', '-'), v)) + self._headers.append( + (self._convert_string_type(_name), "; ".join(parts))) diff -r fb92df8b634c -r ed5448edcbfa mercurial/hook.py --- a/mercurial/hook.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/hook.py Wed Apr 18 15:32:08 2018 -0400 @@ -19,6 +19,10 @@ pycompat, util, ) +from .utils import ( + procutil, + stringutil, +) def _pythonhook(ui, repo, htype, hname, funcname, args, throw): '''call python hook. hook is callable object, looked up as @@ -41,7 +45,7 @@ % (hname, funcname)) modname = funcname[:d] oldpaths = sys.path - if util.mainfrozen(): + if procutil.mainfrozen(): # binary installs require sys.path manipulation modpath, modfile = os.path.split(modname) if modpath and modfile: @@ -49,12 +53,12 @@ modname = modfile with demandimport.deactivated(): try: - obj = __import__(modname) + obj = __import__(pycompat.sysstr(modname)) except (ImportError, SyntaxError): e1 = sys.exc_info() try: # extensions are loaded with hgext_ prefix - obj = __import__("hgext_%s" % modname) + obj = __import__(r"hgext_%s" % pycompat.sysstr(modname)) except (ImportError, SyntaxError): e2 = sys.exc_info() if ui.tracebackflag: @@ -133,12 +137,8 @@ for k, v in args.iteritems(): if callable(v): v = v() - if isinstance(v, dict): - # make the dictionary element order stable across Python - # implementations - v = ('{' + - ', '.join('%r: %r' % i for i in sorted(v.iteritems())) + - '}') + if isinstance(v, (dict, list)): + v = stringutil.pprint(v, bprefix=False) env['HG_' + k.upper()] = v if repo: @@ -151,7 +151,7 @@ ui.log('exthook', 'exthook-%s: %s finished in %0.2f seconds\n', name, cmd, duration) if r: - desc, r = util.explainexit(r) + desc = 
procutil.explainexit(r) if throw: raise error.HookAbort(_('%s hook %s') % (name, desc)) ui.warn(_('warning: %s hook %s\n') % (name, desc)) @@ -222,11 +222,11 @@ for hname, cmd in hooks: if oldstdout == -1 and _redirect: try: - stdoutno = util.stdout.fileno() - stderrno = util.stderr.fileno() + stdoutno = procutil.stdout.fileno() + stderrno = procutil.stderr.fileno() # temporarily redirect stdout to stderr, if possible if stdoutno >= 0 and stderrno >= 0: - util.stdout.flush() + procutil.stdout.flush() oldstdout = os.dup(stdoutno) os.dup2(stderrno, stdoutno) except (OSError, AttributeError): @@ -265,14 +265,14 @@ raised = False res[hname] = r, raised + finally: + # The stderr is fully buffered on Windows when connected to a pipe. + # A forcible flush is required to make small stderr data in the + # remote side available to the client immediately. + procutil.stderr.flush() - # The stderr is fully buffered on Windows when connected to a pipe. - # A forcible flush is required to make small stderr data in the - # remote side available to the client immediately. - util.stderr.flush() - finally: if _redirect and oldstdout >= 0: - util.stdout.flush() # write hook output to stderr fd + procutil.stdout.flush() # write hook output to stderr fd os.dup2(oldstdout, stdoutno) os.close(oldstdout) diff -r fb92df8b634c -r ed5448edcbfa mercurial/httpclient/__init__.py --- a/mercurial/httpclient/__init__.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,912 +0,0 @@ -# Copyright 2010, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -"""Improved HTTP/1.1 client library - -This library contains an HTTPConnection which is similar to the one in -httplib, but has several additional features: - - * supports keepalives natively - * uses select() to block for incoming data - * notices when the server responds early to a request - * implements ssl inline instead of in a different class -""" -from __future__ import absolute_import - -# Many functions in this file have too many arguments. 
-# pylint: disable=R0913 -import email -import email.message -import errno -import inspect -import logging -import select -import socket -import ssl -import sys - -try: - import cStringIO as io - io.StringIO -except ImportError: - import io - -try: - import httplib - httplib.HTTPException -except ImportError: - import http.client as httplib - -from . import ( - _readers, -) - -logger = logging.getLogger(__name__) - -__all__ = ['HTTPConnection', 'HTTPResponse'] - -HTTP_VER_1_0 = b'HTTP/1.0' -HTTP_VER_1_1 = b'HTTP/1.1' - -OUTGOING_BUFFER_SIZE = 1 << 15 -INCOMING_BUFFER_SIZE = 1 << 20 - -HDR_ACCEPT_ENCODING = 'accept-encoding' -HDR_CONNECTION_CTRL = 'connection' -HDR_CONTENT_LENGTH = 'content-length' -HDR_XFER_ENCODING = 'transfer-encoding' - -XFER_ENCODING_CHUNKED = 'chunked' - -CONNECTION_CLOSE = 'close' - -EOL = b'\r\n' -_END_HEADERS = EOL * 2 - -# Based on some searching around, 1 second seems like a reasonable -# default here. -TIMEOUT_ASSUME_CONTINUE = 1 -TIMEOUT_DEFAULT = None - -if sys.version_info > (3, 0): - _unicode = str -else: - _unicode = unicode - -def _ensurebytes(data): - if not isinstance(data, (_unicode, bytes)): - data = str(data) - if not isinstance(data, bytes): - try: - return data.encode('latin-1') - except UnicodeEncodeError as err: - raise UnicodeEncodeError( - err.encoding, - err.object, - err.start, - err.end, - '%r is not valid Latin-1 Use .encode("utf-8") ' - 'if sending as utf-8 is desired.' % ( - data[err.start:err.end],)) - return data - -class _CompatMessage(email.message.Message): - """Workaround for rfc822.Message and email.message.Message API diffs.""" - - @classmethod - def from_string(cls, s): - if sys.version_info > (3, 0): - # Python 3 can't decode headers from bytes, so we have to - # trust RFC 2616 and decode the headers as iso-8859-1 - # bytes. 
- s = s.decode('iso-8859-1') - headers = email.message_from_string(s, _class=_CompatMessage) - # Fix multi-line headers to match httplib's behavior from - # Python 2.x, since email.message.Message handles them in - # slightly different ways. - if sys.version_info < (3, 0): - new = [] - for h, v in headers._headers: - if '\r\n' in v: - v = '\n'.join([' ' + x.lstrip() for x in v.split('\r\n')])[1:] - new.append((h, v)) - headers._headers = new - return headers - - def getheaders(self, key): - return self.get_all(key) - - def getheader(self, key, default=None): - return self.get(key, failobj=default) - - -class HTTPResponse(object): - """Response from an HTTP server. - - The response will continue to load as available. If you need the - complete response before continuing, check the .complete() method. - """ - def __init__(self, sock, timeout, method): - self.sock = sock - self.method = method - self.raw_response = b'' - self._headers_len = 0 - self.headers = None - self.will_close = False - self.status_line = b'' - self.status = None - self.continued = False - self.http_version = None - self.reason = None - self._reader = None - - self._read_location = 0 - self._eol = EOL - - self._timeout = timeout - - @property - def _end_headers(self): - return self._eol * 2 - - def complete(self): - """Returns true if this response is completely loaded. - - Note that if this is a connection where complete means the - socket is closed, this will nearly always return False, even - in cases where all the data has actually been loaded. - """ - if self._reader: - return self._reader.done() - - def _close(self): - if self._reader is not None: - # We're a friend of the reader class here. 
- # pylint: disable=W0212 - self._reader._close() - - def getheader(self, header, default=None): - return self.headers.getheader(header, default=default) - - def getheaders(self): - if sys.version_info < (3, 0): - return [(k.lower(), v) for k, v in self.headers.items()] - # Starting in Python 3, headers aren't lowercased before being - # returned here. - return self.headers.items() - - def readline(self): - """Read a single line from the response body. - - This may block until either a line ending is found or the - response is complete. - """ - blocks = [] - while True: - self._reader.readto(b'\n', blocks) - - if blocks and blocks[-1][-1:] == b'\n' or self.complete(): - break - - self._select() - - return b''.join(blocks) - - def read(self, length=None): - """Read data from the response body.""" - # if length is None, unbounded read - while (not self.complete() # never select on a finished read - and (not length # unbounded, so we wait for complete() - or length > self._reader.available_data)): - self._select() - if not length: - length = self._reader.available_data - r = self._reader.read(length) - if self.complete() and self.will_close: - self.sock.close() - return r - - def _select(self): - r, unused_write, unused_err = select.select( - [self.sock], [], [], self._timeout) - if not r: - # socket was not readable. If the response is not - # complete, raise a timeout. - if not self.complete(): - logger.info('timed out with timeout of %s', self._timeout) - raise HTTPTimeoutException('timeout reading data') - try: - data = self.sock.recv(INCOMING_BUFFER_SIZE) - except ssl.SSLError as e: - if e.args[0] != ssl.SSL_ERROR_WANT_READ: - raise - logger.debug('SSL_ERROR_WANT_READ in _select, should retry later') - return True - logger.debug('response read %d data during _select', len(data)) - # If the socket was readable and no data was read, that means - # the socket was closed. Inform the reader (if any) so it can - # raise an exception if this is an invalid situation. 
- if not data: - if self._reader: - # We're a friend of the reader class here. - # pylint: disable=W0212 - self._reader._close() - return False - else: - self._load_response(data) - return True - - # This method gets replaced by _load later, which confuses pylint. - def _load_response(self, data): # pylint: disable=E0202 - # Being here implies we're not at the end of the headers yet, - # since at the end of this method if headers were completely - # loaded we replace this method with the load() method of the - # reader we created. - self.raw_response += data - # This is a bogus server with bad line endings - if self._eol not in self.raw_response: - for bad_eol in (b'\n', b'\r'): - if (bad_eol in self.raw_response - # verify that bad_eol is not the end of the incoming data - # as this could be a response line that just got - # split between \r and \n. - and (self.raw_response.index(bad_eol) < - (len(self.raw_response) - 1))): - logger.info('bogus line endings detected, ' - 'using %r for EOL', bad_eol) - self._eol = bad_eol - break - # exit early if not at end of headers - if self._end_headers not in self.raw_response or self.headers: - return - - # handle 100-continue response - hdrs, body = self.raw_response.split(self._end_headers, 1) - unused_http_ver, status = hdrs.split(b' ', 1) - if status.startswith(b'100'): - self.raw_response = body - self.continued = True - logger.debug('continue seen, setting body to %r', body) - return - - # arriving here means we should parse response headers - # as all headers have arrived completely - hdrs, body = self.raw_response.split(self._end_headers, 1) - del self.raw_response - if self._eol in hdrs: - self.status_line, hdrs = hdrs.split(self._eol, 1) - else: - self.status_line = hdrs - hdrs = b'' - # TODO HTTP < 1.0 support - (self.http_version, self.status, - self.reason) = self.status_line.split(b' ', 2) - self.status = int(self.status) - if self._eol != EOL: - hdrs = hdrs.replace(self._eol, b'\r\n') - headers = 
_CompatMessage.from_string(hdrs) - content_len = None - if HDR_CONTENT_LENGTH in headers: - content_len = int(headers[HDR_CONTENT_LENGTH]) - if self.http_version == HTTP_VER_1_0: - self.will_close = True - elif HDR_CONNECTION_CTRL in headers: - self.will_close = ( - headers[HDR_CONNECTION_CTRL].lower() == CONNECTION_CLOSE) - if (HDR_XFER_ENCODING in headers - and headers[HDR_XFER_ENCODING].lower() == XFER_ENCODING_CHUNKED): - self._reader = _readers.ChunkedReader(self._eol) - logger.debug('using a chunked reader') - else: - # HEAD responses are forbidden from returning a body, and - # it's implausible for a CONNECT response to use - # close-is-end logic for an OK response. - if (self.method == b'HEAD' or - (self.method == b'CONNECT' and content_len is None)): - content_len = 0 - if content_len is not None: - logger.debug('using a content-length reader with length %d', - content_len) - self._reader = _readers.ContentLengthReader(content_len) - else: - # Response body had no length specified and is not - # chunked, so the end of the body will only be - # identifiable by the termination of the socket by the - # server. My interpretation of the spec means that we - # are correct in hitting this case if - # transfer-encoding, content-length, and - # connection-control were left unspecified. - self._reader = _readers.CloseIsEndReader() - logger.debug('using a close-is-end reader') - self.will_close = True - - if body: - # We're a friend of the reader class here. - # pylint: disable=W0212 - self._reader._load(body) - logger.debug('headers complete') - self.headers = headers - # We're a friend of the reader class here. - # pylint: disable=W0212 - self._load_response = self._reader._load - -def _foldheaders(headers): - """Given some headers, rework them so we can safely overwrite values. 
- - >>> _foldheaders({'Accept-Encoding': 'wat'}) - {'accept-encoding': ('Accept-Encoding', 'wat')} - """ - return dict((k.lower(), (k, v)) for k, v in headers.items()) - -try: - inspect.signature - def _handlesarg(func, arg): - """ Try to determine if func accepts arg - - If it takes arg, return True - If it happens to take **args, then it could do anything: - * It could throw a different TypeError, just for fun - * It could throw an ArgumentError or anything else - * It could choose not to throw an Exception at all - ... return 'unknown' - - Otherwise, return False - """ - params = inspect.signature(func).parameters - if arg in params: - return True - for p in params: - if params[p].kind == inspect._ParameterKind.VAR_KEYWORD: - return 'unknown' - return False -except AttributeError: - def _handlesarg(func, arg): - """ Try to determine if func accepts arg - - If it takes arg, return True - If it happens to take **args, then it could do anything: - * It could throw a different TypeError, just for fun - * It could throw an ArgumentError or anything else - * It could choose not to throw an Exception at all - ... return 'unknown' - - Otherwise, return False - """ - spec = inspect.getargspec(func) - if arg in spec.args: - return True - if spec.keywords: - return 'unknown' - return False - -class HTTPConnection(object): - """Connection to a single http server. - - Supports 100-continue and keepalives natively. Uses select() for - non-blocking socket operations. - """ - http_version = HTTP_VER_1_1 - response_class = HTTPResponse - - def __init__(self, host, port=None, use_ssl=None, ssl_validator=None, - timeout=TIMEOUT_DEFAULT, - continue_timeout=TIMEOUT_ASSUME_CONTINUE, - proxy_hostport=None, proxy_headers=None, - ssl_wrap_socket=None, **ssl_opts): - """Create a new HTTPConnection. - - Args: - host: The host to which we'll connect. - port: Optional. The port over which we'll connect. Default 80 for - non-ssl, 443 for ssl. - use_ssl: Optional. Whether to use ssl. 
Defaults to False if port is - not 443, true if port is 443. - ssl_validator: a function(socket) to validate the ssl cert - timeout: Optional. Connection timeout, default is TIMEOUT_DEFAULT. - continue_timeout: Optional. Timeout for waiting on an expected - "100 Continue" response. Default is TIMEOUT_ASSUME_CONTINUE. - proxy_hostport: Optional. Tuple of (host, port) to use as an http - proxy for the connection. Default is to not use a proxy. - proxy_headers: Optional dict of header keys and values to send to - a proxy when using CONNECT. For compatibility with - httplib, the Proxy-Authorization header may be - specified in headers for request(), which will clobber - any such header specified here if specified. Providing - this option and not proxy_hostport will raise an - ValueError. - ssl_wrap_socket: Optional function to use for wrapping - sockets. If unspecified, the one from the ssl module will - be used if available, or something that's compatible with - it if on a Python older than 2.6. - - Any extra keyword arguments to this function will be provided - to the ssl_wrap_socket method. 
If no ssl - """ - host = _ensurebytes(host) - if port is None and host.count(b':') == 1 or b']:' in host: - host, port = host.rsplit(b':', 1) - port = int(port) - if b'[' in host: - host = host[1:-1] - if ssl_wrap_socket is not None: - _wrap_socket = ssl_wrap_socket - else: - _wrap_socket = ssl.wrap_socket - call_wrap_socket = None - handlesubar = _handlesarg(_wrap_socket, 'server_hostname') - if handlesubar is True: - # supports server_hostname - call_wrap_socket = _wrap_socket - handlesnobar = _handlesarg(_wrap_socket, 'serverhostname') - if handlesnobar is True and handlesubar is not True: - # supports serverhostname - def call_wrap_socket(sock, server_hostname=None, **ssl_opts): - return _wrap_socket(sock, serverhostname=server_hostname, - **ssl_opts) - if handlesubar is False and handlesnobar is False: - # does not support either - def call_wrap_socket(sock, server_hostname=None, **ssl_opts): - return _wrap_socket(sock, **ssl_opts) - if call_wrap_socket is None: - # we assume it takes **args - def call_wrap_socket(sock, **ssl_opts): - if 'server_hostname' in ssl_opts: - ssl_opts['serverhostname'] = ssl_opts['server_hostname'] - return _wrap_socket(sock, **ssl_opts) - self._ssl_wrap_socket = call_wrap_socket - if use_ssl is None and port is None: - use_ssl = False - port = 80 - elif use_ssl is None: - use_ssl = (port == 443) - elif port is None: - port = (use_ssl and 443 or 80) - self.port = port - self.ssl = use_ssl - self.ssl_opts = ssl_opts - self._ssl_validator = ssl_validator - self.host = host - self.sock = None - self._current_response = None - self._current_response_taken = False - if proxy_hostport is None: - self._proxy_host = self._proxy_port = None - if proxy_headers: - raise ValueError( - 'proxy_headers may not be specified unless ' - 'proxy_hostport is also specified.') - else: - self._proxy_headers = {} - else: - self._proxy_host, self._proxy_port = proxy_hostport - self._proxy_headers = _foldheaders(proxy_headers or {}) - - self.timeout = 
timeout - self.continue_timeout = continue_timeout - - def _connect(self, proxy_headers): - """Connect to the host and port specified in __init__.""" - if self.sock: - return - if self._proxy_host is not None: - logger.info('Connecting to http proxy %s:%s', - self._proxy_host, self._proxy_port) - sock = socket.create_connection((self._proxy_host, - self._proxy_port)) - if self.ssl: - data = self._buildheaders(b'CONNECT', b'%s:%d' % (self.host, - self.port), - proxy_headers, HTTP_VER_1_0) - sock.send(data) - sock.setblocking(0) - r = self.response_class(sock, self.timeout, b'CONNECT') - timeout_exc = HTTPTimeoutException( - 'Timed out waiting for CONNECT response from proxy') - while not r.complete(): - try: - # We're a friend of the response class, so let - # us use the private attribute. - # pylint: disable=W0212 - if not r._select(): - if not r.complete(): - raise timeout_exc - except HTTPTimeoutException: - # This raise/except pattern looks goofy, but - # _select can raise the timeout as well as the - # loop body. I wish it wasn't this convoluted, - # but I don't have a better solution - # immediately handy. - raise timeout_exc - if r.status != 200: - raise HTTPProxyConnectFailedException( - 'Proxy connection failed: %d %s' % (r.status, - r.read())) - logger.info('CONNECT (for SSL) to %s:%s via proxy succeeded.', - self.host, self.port) - else: - sock = socket.create_connection((self.host, self.port)) - if self.ssl: - # This is the default, but in the case of proxied SSL - # requests the proxy logic above will have cleared - # blocking mode, so re-enable it just to be safe. 
- sock.setblocking(1) - logger.debug('wrapping socket for ssl with options %r', - self.ssl_opts) - sock = self._ssl_wrap_socket(sock, server_hostname=self.host, - **self.ssl_opts) - if self._ssl_validator: - self._ssl_validator(sock) - sock.setblocking(0) - self.sock = sock - - def _buildheaders(self, method, path, headers, http_ver): - if self.ssl and self.port == 443 or self.port == 80: - # default port for protocol, so leave it out - hdrhost = self.host - else: - # include nonstandard port in header - if b':' in self.host: # must be IPv6 - hdrhost = b'[%s]:%d' % (self.host, self.port) - else: - hdrhost = b'%s:%d' % (self.host, self.port) - if self._proxy_host and not self.ssl: - # When talking to a regular http proxy we must send the - # full URI, but in all other cases we must not (although - # technically RFC 2616 says servers must accept our - # request if we screw up, experimentally few do that - # correctly.) - assert path[0:1] == b'/', 'path must start with a /' - path = b'http://%s%s' % (hdrhost, path) - outgoing = [b'%s %s %s%s' % (method, path, http_ver, EOL)] - headers[b'host'] = (b'Host', hdrhost) - headers[HDR_ACCEPT_ENCODING] = (HDR_ACCEPT_ENCODING, 'identity') - for hdr, val in sorted((_ensurebytes(h), _ensurebytes(v)) - for h, v in headers.values()): - outgoing.append(b'%s: %s%s' % (hdr, val, EOL)) - outgoing.append(EOL) - return b''.join(outgoing) - - def close(self): - """Close the connection to the server. - - This is a no-op if the connection is already closed. The - connection may automatically close if requested by the server - or required by the nature of a response. - """ - if self.sock is None: - return - self.sock.close() - self.sock = None - logger.info('closed connection to %s on %s', self.host, self.port) - - def busy(self): - """Returns True if this connection object is currently in use. - - If a response is still pending, this will return True, even if - the request has finished sending. 
In the future, - HTTPConnection may transparently juggle multiple connections - to the server, in which case this will be useful to detect if - any of those connections is ready for use. - """ - cr = self._current_response - if cr is not None: - if self._current_response_taken: - if cr.will_close: - self.sock = None - self._current_response = None - return False - elif cr.complete(): - self._current_response = None - return False - return True - return False - - def _reconnect(self, where, pheaders): - logger.info('reconnecting during %s', where) - self.close() - self._connect(pheaders) - - def request(self, method, path, body=None, headers=None, - expect_continue=False): - """Send a request to the server. - - For increased flexibility, this does not return the response - object. Future versions of HTTPConnection that juggle multiple - sockets will be able to send (for example) 5 requests all at - once, and then let the requests arrive as data is - available. Use the `getresponse()` method to retrieve the - response. - """ - if headers is None: - headers = {} - method = _ensurebytes(method) - path = _ensurebytes(path) - if self.busy(): - raise httplib.CannotSendRequest( - 'Can not send another request before ' - 'current response is read!') - self._current_response_taken = False - - logger.info('sending %s request for %s to %s on port %s', - method, path, self.host, self.port) - - hdrs = _foldheaders(headers) - # Figure out headers that have to be computed from the request - # body. 
- chunked = False - if body and HDR_CONTENT_LENGTH not in hdrs: - if getattr(body, '__len__', False): - hdrs[HDR_CONTENT_LENGTH] = (HDR_CONTENT_LENGTH, - b'%d' % len(body)) - elif getattr(body, 'read', False): - hdrs[HDR_XFER_ENCODING] = (HDR_XFER_ENCODING, - XFER_ENCODING_CHUNKED) - chunked = True - else: - raise BadRequestData('body has no __len__() nor read()') - # Figure out expect-continue header - if hdrs.get('expect', ('', ''))[1].lower() == b'100-continue': - expect_continue = True - elif expect_continue: - hdrs['expect'] = (b'Expect', b'100-Continue') - # httplib compatibility: if the user specified a - # proxy-authorization header, that's actually intended for a - # proxy CONNECT action, not the real request, but only if - # we're going to use a proxy. - pheaders = dict(self._proxy_headers) - if self._proxy_host and self.ssl: - pa = hdrs.pop('proxy-authorization', None) - if pa is not None: - pheaders['proxy-authorization'] = pa - # Build header data - outgoing_headers = self._buildheaders( - method, path, hdrs, self.http_version) - - # If we're reusing the underlying socket, there are some - # conditions where we'll want to retry, so make a note of the - # state of self.sock - fresh_socket = self.sock is None - self._connect(pheaders) - response = None - first = True - - while ((outgoing_headers or body) - and not (response and response.complete())): - select_timeout = self.timeout - out = outgoing_headers or body - blocking_on_continue = False - if expect_continue and not outgoing_headers and not ( - response and (response.headers or response.continued)): - logger.info( - 'waiting up to %s seconds for' - ' continue response from server', - self.continue_timeout) - select_timeout = self.continue_timeout - blocking_on_continue = True - out = False - if out: - w = [self.sock] - else: - w = [] - r, w, x = select.select([self.sock], w, [], select_timeout) - # if we were expecting a 100 continue and it's been long - # enough, just go ahead and assume it's ok. 
This is the - # recommended behavior from the RFC. - if r == w == x == []: - if blocking_on_continue: - expect_continue = False - logger.info('no response to continue expectation from ' - 'server, optimistically sending request body') - else: - raise HTTPTimeoutException('timeout sending data') - was_first = first - - # incoming data - if r: - try: - try: - data = r[0].recv(INCOMING_BUFFER_SIZE) - except ssl.SSLError as e: - if e.args[0] != ssl.SSL_ERROR_WANT_READ: - raise - logger.debug('SSL_ERROR_WANT_READ while sending ' - 'data, retrying...') - continue - if not data: - logger.info('socket appears closed in read') - self.sock = None - self._current_response = None - if response is not None: - # We're a friend of the response class, so let - # us use the private attribute. - # pylint: disable=W0212 - response._close() - # This if/elif ladder is a bit subtle, - # comments in each branch should help. - if response is not None and response.complete(): - # Server responded completely and then - # closed the socket. We should just shut - # things down and let the caller get their - # response. - logger.info('Got an early response, ' - 'aborting remaining request.') - break - elif was_first and response is None: - # Most likely a keepalive that got killed - # on the server's end. Commonly happens - # after getting a really large response - # from the server. - logger.info( - 'Connection appeared closed in read on first' - ' request loop iteration, will retry.') - self._reconnect('read', pheaders) - continue - else: - # We didn't just send the first data hunk, - # and either have a partial response or no - # response at all. There's really nothing - # meaningful we can do here. 
- raise HTTPStateError( - 'Connection appears closed after ' - 'some request data was written, but the ' - 'response was missing or incomplete!') - logger.debug('read %d bytes in request()', len(data)) - if response is None: - response = self.response_class( - r[0], self.timeout, method) - # We're a friend of the response class, so let us - # use the private attribute. - # pylint: disable=W0212 - response._load_response(data) - # Jump to the next select() call so we load more - # data if the server is still sending us content. - continue - except socket.error as e: - if e[0] != errno.EPIPE and not was_first: - raise - - # outgoing data - if w and out: - try: - if getattr(out, 'read', False): - # pylint guesses the type of out incorrectly here - # pylint: disable=E1103 - data = out.read(OUTGOING_BUFFER_SIZE) - if not data: - continue - if len(data) < OUTGOING_BUFFER_SIZE: - if chunked: - body = b'0' + EOL + EOL - else: - body = None - if chunked: - # This encode is okay because we know - # hex() is building us only 0-9 and a-f - # digits. - asciilen = hex(len(data))[2:].encode('ascii') - out = asciilen + EOL + data + EOL - else: - out = data - amt = w[0].send(out) - except socket.error as e: - if e[0] == ssl.SSL_ERROR_WANT_WRITE and self.ssl: - # This means that SSL hasn't flushed its buffer into - # the socket yet. - # TODO: find a way to block on ssl flushing its buffer - # similar to selecting on a raw socket. - continue - if e[0] == errno.EWOULDBLOCK or e[0] == errno.EAGAIN: - continue - elif (e[0] not in (errno.ECONNRESET, errno.EPIPE) - and not first): - raise - self._reconnect('write', pheaders) - amt = self.sock.send(out) - logger.debug('sent %d', amt) - first = False - if out is body: - body = out[amt:] - else: - outgoing_headers = out[amt:] - # End of request-sending loop. 
- - # close if the server response said to or responded before eating - # the whole request - if response is None: - response = self.response_class(self.sock, self.timeout, method) - if not fresh_socket: - if not response._select(): - # This means the response failed to get any response - # data at all, and in all probability the socket was - # closed before the server even saw our request. Try - # the request again on a fresh socket. - logger.debug('response._select() failed during request().' - ' Assuming request needs to be retried.') - self.sock = None - # Call this method explicitly to re-try the - # request. We don't use self.request() because - # some tools (notably Mercurial) expect to be able - # to subclass and redefine request(), and they - # don't have the same argspec as we do. - # - # TODO restructure sending of requests to avoid - # this recursion - return HTTPConnection.request( - self, method, path, body=body, headers=headers, - expect_continue=expect_continue) - data_left = bool(outgoing_headers or body) - if data_left: - logger.info('stopped sending request early, ' - 'will close the socket to be safe.') - response.will_close = True - if response.will_close: - # The socket will be closed by the response, so we disown - # the socket - self.sock = None - self._current_response = response - - def getresponse(self): - """Returns the response to the most recent request.""" - if self._current_response is None: - raise httplib.ResponseNotReady() - r = self._current_response - while r.headers is None: - # We're a friend of the response class, so let us use the - # private attribute. 
- # pylint: disable=W0212 - if not r._select() and not r.complete(): - raise _readers.HTTPRemoteClosedError() - if r.will_close: - self.sock = None - self._current_response = None - elif r.complete(): - self._current_response = None - else: - self._current_response_taken = True - return r - - -class HTTPTimeoutException(httplib.HTTPException): - """A timeout occurred while waiting on the server.""" - - -class BadRequestData(httplib.HTTPException): - """Request body object has neither __len__ nor read.""" - - -class HTTPProxyConnectFailedException(httplib.HTTPException): - """Connecting to the HTTP proxy failed.""" - - -class HTTPStateError(httplib.HTTPException): - """Invalid internal state encountered.""" - -# Forward this exception type from _readers since it needs to be part -# of the public API. -HTTPRemoteClosedError = _readers.HTTPRemoteClosedError -# no-check-code diff -r fb92df8b634c -r ed5448edcbfa mercurial/httpclient/_readers.py --- a/mercurial/httpclient/_readers.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,239 +0,0 @@ -# Copyright 2011, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
- -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -"""Reader objects to abstract out different body response types. - -This module is package-private. It is not expected that these will -have any clients outside of httpplus. -""" -from __future__ import absolute_import - -try: - import httplib - httplib.HTTPException -except ImportError: - import http.client as httplib - -import logging - -logger = logging.getLogger(__name__) - - -class ReadNotReady(Exception): - """Raised when read() is attempted but not enough data is loaded.""" - - -class HTTPRemoteClosedError(httplib.HTTPException): - """The server closed the remote socket in the middle of a response.""" - - -class AbstractReader(object): - """Abstract base class for response readers. - - Subclasses must implement _load, and should implement _close if - it's not an error for the server to close their socket without - some termination condition being detected during _load. 
- """ - def __init__(self): - self._finished = False - self._done_chunks = [] - self.available_data = 0 - - def _addchunk(self, data): - self._done_chunks.append(data) - self.available_data += len(data) - - def _pushchunk(self, data): - self._done_chunks.insert(0, data) - self.available_data += len(data) - - def _popchunk(self): - b = self._done_chunks.pop(0) - self.available_data -= len(b) - - return b - - def done(self): - """Returns true if the response body is entirely read.""" - return self._finished - - def read(self, amt): - """Read amt bytes from the response body.""" - if self.available_data < amt and not self._finished: - raise ReadNotReady() - blocks = [] - need = amt - while self._done_chunks: - b = self._popchunk() - if len(b) > need: - nb = b[:need] - self._pushchunk(b[need:]) - b = nb - blocks.append(b) - need -= len(b) - if need == 0: - break - result = b''.join(blocks) - assert len(result) == amt or (self._finished and len(result) < amt) - - return result - - def readto(self, delimstr, blocks = None): - """return available data chunks up to the first one in which - delimstr occurs. No data will be returned after delimstr -- - the chunk in which it occurs will be split and the remainder - pushed back onto the available data queue. If blocks is - supplied chunks will be added to blocks, otherwise a new list - will be allocated. - """ - if blocks is None: - blocks = [] - - while self._done_chunks: - b = self._popchunk() - i = b.find(delimstr) + len(delimstr) - if i: - if i < len(b): - self._pushchunk(b[i:]) - blocks.append(b[:i]) - break - else: - blocks.append(b) - - return blocks - - def _load(self, data): # pragma: no cover - """Subclasses must implement this. - - As data is available to be read out of this object, it should - be placed into the _done_chunks list. Subclasses should not - rely on data remaining in _done_chunks forever, as it may be - reaped if the client is parsing data as it comes in. 
- """ - raise NotImplementedError - - def _close(self): - """Default implementation of close. - - The default implementation assumes that the reader will mark - the response as finished on the _finished attribute once the - entire response body has been read. In the event that this is - not true, the subclass should override the implementation of - close (for example, close-is-end responses have to set - self._finished in the close handler.) - """ - if not self._finished: - raise HTTPRemoteClosedError( - 'server appears to have closed the socket mid-response') - - -class AbstractSimpleReader(AbstractReader): - """Abstract base class for simple readers that require no response decoding. - - Examples of such responses are Connection: Close (close-is-end) - and responses that specify a content length. - """ - def _load(self, data): - if data: - assert not self._finished, ( - 'tried to add data (%r) to a closed reader!' % data) - logger.debug('%s read an additional %d data', - self.name, len(data)) # pylint: disable=E1101 - self._addchunk(data) - - -class CloseIsEndReader(AbstractSimpleReader): - """Reader for responses that specify Connection: Close for length.""" - name = 'close-is-end' - - def _close(self): - logger.info('Marking close-is-end reader as closed.') - self._finished = True - - -class ContentLengthReader(AbstractSimpleReader): - """Reader for responses that specify an exact content length.""" - name = 'content-length' - - def __init__(self, amount): - AbstractSimpleReader.__init__(self) - self._amount = amount - if amount == 0: - self._finished = True - self._amount_seen = 0 - - def _load(self, data): - AbstractSimpleReader._load(self, data) - self._amount_seen += len(data) - if self._amount_seen >= self._amount: - self._finished = True - logger.debug('content-length read complete') - - -class ChunkedReader(AbstractReader): - """Reader for chunked transfer encoding responses.""" - def __init__(self, eol): - AbstractReader.__init__(self) - self._eol = eol 
- self._leftover_skip_amt = 0 - self._leftover_data = '' - - def _load(self, data): - assert not self._finished, 'tried to add data to a closed reader!' - logger.debug('chunked read an additional %d data', len(data)) - position = 0 - if self._leftover_data: - logger.debug( - 'chunked reader trying to finish block from leftover data') - # TODO: avoid this string concatenation if possible - data = self._leftover_data + data - position = self._leftover_skip_amt - self._leftover_data = '' - self._leftover_skip_amt = 0 - datalen = len(data) - while position < datalen: - split = data.find(self._eol, position) - if split == -1: - self._leftover_data = data - self._leftover_skip_amt = position - return - amt = int(data[position:split], base=16) - block_start = split + len(self._eol) - # If the whole data chunk plus the eol trailer hasn't - # loaded, we'll wait for the next load. - if block_start + amt + len(self._eol) > len(data): - self._leftover_data = data - self._leftover_skip_amt = position - return - if amt == 0: - self._finished = True - logger.debug('closing chunked reader due to chunk of length 0') - return - self._addchunk(data[block_start:block_start + amt]) - position = block_start + amt + len(self._eol) -# no-check-code diff -r fb92df8b634c -r ed5448edcbfa mercurial/httpconnection.py --- a/mercurial/httpconnection.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/httpconnection.py Wed Apr 18 15:32:08 2018 -0400 @@ -10,15 +10,11 @@ from __future__ import absolute_import -import logging import os -import socket from .i18n import _ from . 
import ( - httpclient, - sslutil, - urllibcompat, + pycompat, util, ) @@ -67,6 +63,7 @@ # moved here from url.py to avoid a cycle def readauthforuri(ui, uri, user): + uri = pycompat.bytesurl(uri) # Read configuration groups = {} for key, val in ui.configitems('auth'): @@ -110,190 +107,3 @@ if user and not bestuser: auth['username'] = user return bestauth - -# Mercurial (at least until we can remove the old codepath) requires -# that the http response object be sufficiently file-like, so we -# provide a close() method here. -class HTTPResponse(httpclient.HTTPResponse): - def close(self): - pass - -class HTTPConnection(httpclient.HTTPConnection): - response_class = HTTPResponse - def request(self, method, uri, body=None, headers=None): - if headers is None: - headers = {} - if isinstance(body, httpsendfile): - body.seek(0) - httpclient.HTTPConnection.request(self, method, uri, body=body, - headers=headers) - - -_configuredlogging = False -LOGFMT = '%(levelname)s:%(name)s:%(lineno)d:%(message)s' -# Subclass BOTH of these because otherwise urllib2 "helpfully" -# reinserts them since it notices we don't include any subclasses of -# them. 
-class http2handler(urlreq.httphandler, urlreq.httpshandler): - def __init__(self, ui, pwmgr): - global _configuredlogging - urlreq.abstracthttphandler.__init__(self) - self.ui = ui - self.pwmgr = pwmgr - self._connections = {} - # developer config: ui.http2debuglevel - loglevel = ui.config('ui', 'http2debuglevel') - if loglevel and not _configuredlogging: - _configuredlogging = True - logger = logging.getLogger('mercurial.httpclient') - logger.setLevel(getattr(logging, loglevel.upper())) - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter(LOGFMT)) - logger.addHandler(handler) - - def close_all(self): - """Close and remove all connection objects being kept for reuse.""" - for openconns in self._connections.values(): - for conn in openconns: - conn.close() - self._connections = {} - - # shamelessly borrowed from urllib2.AbstractHTTPHandler - def do_open(self, http_class, req, use_ssl): - """Return an addinfourl object for the request, using http_class. - - http_class must implement the HTTPConnection API from httplib. - The addinfourl return value is a file-like object. It also - has methods and attributes including: - - info(): return a mimetools.Message object for the headers - - geturl(): return the original request URL - - code: HTTP status code - """ - # If using a proxy, the host returned by get_host() is - # actually the proxy. On Python 2.6.1, the real destination - # hostname is encoded in the URI in the urllib2 request - # object. On Python 2.6.5, it's stored in the _tunnel_host - # attribute which has no accessor. - tunhost = getattr(req, '_tunnel_host', None) - host = urllibcompat.gethost(req) - if tunhost: - proxyhost = host - host = tunhost - elif req.has_proxy(): - proxyhost = urllibcompat.gethost(req) - host = urllibcompat.getselector( - req).split('://', 1)[1].split('/', 1)[0] - else: - proxyhost = None - - if proxyhost: - if ':' in proxyhost: - # Note: this means we'll explode if we try and use an - # IPv6 http proxy. 
This isn't a regression, so we - # won't worry about it for now. - proxyhost, proxyport = proxyhost.rsplit(':', 1) - else: - proxyport = 3128 # squid default - proxy = (proxyhost, proxyport) - else: - proxy = None - - if not host: - raise urlerr.urlerror('no host given') - - connkey = use_ssl, host, proxy - allconns = self._connections.get(connkey, []) - conns = [c for c in allconns if not c.busy()] - if conns: - h = conns[0] - else: - if allconns: - self.ui.debug('all connections for %s busy, making a new ' - 'one\n' % host) - timeout = None - if req.timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - timeout = req.timeout - h = http_class(host, timeout=timeout, proxy_hostport=proxy) - self._connections.setdefault(connkey, []).append(h) - - headers = dict(req.headers) - headers.update(req.unredirected_hdrs) - headers = dict( - (name.title(), val) for name, val in headers.items()) - try: - path = urllibcompat.getselector(req) - if '://' in path: - path = path.split('://', 1)[1].split('/', 1)[1] - if path[0] != '/': - path = '/' + path - h.request(req.get_method(), path, req.data, headers) - r = h.getresponse() - except socket.error as err: # XXX what error? - raise urlerr.urlerror(err) - - # Pick apart the HTTPResponse object to get the addinfourl - # object initialized properly. - r.recv = r.read - - resp = urlreq.addinfourl(r, r.headers, urllibcompat.getfullurl(req)) - resp.code = r.status - resp.msg = r.reason - return resp - - # httplib always uses the given host/port as the socket connect - # target, and then allows full URIs in the request path, which it - # then observes and treats as a signal to do proxying instead. 
- def http_open(self, req): - if urllibcompat.getfullurl(req).startswith('https'): - return self.https_open(req) - def makehttpcon(*args, **kwargs): - k2 = dict(kwargs) - k2[r'use_ssl'] = False - return HTTPConnection(*args, **k2) - return self.do_open(makehttpcon, req, False) - - def https_open(self, req): - # urllibcompat.getfullurl(req) does not contain credentials and we may - # need them to match the certificates. - url = urllibcompat.getfullurl(req) - user, password = self.pwmgr.find_stored_password(url) - res = readauthforuri(self.ui, url, user) - if res: - group, auth = res - self.auth = auth - self.ui.debug("using auth.%s.* for authentication\n" % group) - else: - self.auth = None - return self.do_open(self._makesslconnection, req, True) - - def _makesslconnection(self, host, port=443, *args, **kwargs): - keyfile = None - certfile = None - - if args: # key_file - keyfile = args.pop(0) - if args: # cert_file - certfile = args.pop(0) - - # if the user has specified different key/cert files in - # hgrc, we prefer these - if self.auth and 'key' in self.auth and 'cert' in self.auth: - keyfile = self.auth['key'] - certfile = self.auth['cert'] - - # let host port take precedence - if ':' in host and '[' not in host or ']:' in host: - host, port = host.rsplit(':', 1) - port = int(port) - if '[' in host: - host = host[1:-1] - - kwargs[r'keyfile'] = keyfile - kwargs[r'certfile'] = certfile - - con = HTTPConnection(host, port, use_ssl=True, - ssl_wrap_socket=sslutil.wrapsocket, - ssl_validator=sslutil.validatesocket, - ui=self.ui, - **kwargs) - return con diff -r fb92df8b634c -r ed5448edcbfa mercurial/httppeer.py --- a/mercurial/httppeer.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/httppeer.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,18 +14,29 @@ import socket import struct import tempfile +import weakref from .i18n import _ -from .node import nullid +from .thirdparty import ( + cbor, +) +from .thirdparty.zope import ( + interface as zi, +) from . 
import ( bundle2, error, httpconnection, pycompat, + repository, statichttprepo, - url, + url as urlmod, util, - wireproto, + wireprotoframing, + wireprototypes, + wireprotov1peer, + wireprotov2peer, + wireprotov2server, ) httplib = util.httplib @@ -134,73 +145,270 @@ f.seek(0) self._index = 0 -class httppeer(wireproto.wirepeer): - def __init__(self, ui, path): +def makev1commandrequest(ui, requestbuilder, caps, capablefn, + repobaseurl, cmd, args): + """Make an HTTP request to run a command for a version 1 client. + + ``caps`` is a set of known server capabilities. The value may be + None if capabilities are not yet known. + + ``capablefn`` is a function to evaluate a capability. + + ``cmd``, ``args``, and ``data`` define the command, its arguments, and + raw data to pass to it. + """ + if cmd == 'pushkey': + args['data'] = '' + data = args.pop('data', None) + headers = args.pop('headers', {}) + + ui.debug("sending %s command\n" % cmd) + q = [('cmd', cmd)] + headersize = 0 + # Important: don't use self.capable() here or else you end up + # with infinite recursion when trying to look up capabilities + # for the first time. + postargsok = caps is not None and 'httppostargs' in caps + + # Send arguments via POST. + if postargsok and args: + strargs = urlreq.urlencode(sorted(args.items())) + if not data: + data = strargs + else: + if isinstance(data, bytes): + i = io.BytesIO(data) + i.length = len(data) + data = i + argsio = io.BytesIO(strargs) + argsio.length = len(strargs) + data = _multifile(argsio, data) + headers[r'X-HgArgs-Post'] = len(strargs) + elif args: + # Calling self.capable() can infinite loop if we are calling + # "capabilities". But that command should never accept wire + # protocol arguments. So this should never happen. + assert cmd != 'capabilities' + httpheader = capablefn('httpheader') + if httpheader: + headersize = int(httpheader.split(',', 1)[0]) + + # Send arguments via HTTP headers. 
+ if headersize > 0: + # The headers can typically carry more data than the URL. + encargs = urlreq.urlencode(sorted(args.items())) + for header, value in encodevalueinheaders(encargs, 'X-HgArg', + headersize): + headers[header] = value + # Send arguments via query string (Mercurial <1.9). + else: + q += sorted(args.items()) + + qs = '?%s' % urlreq.urlencode(q) + cu = "%s%s" % (repobaseurl, qs) + size = 0 + if util.safehasattr(data, 'length'): + size = data.length + elif data is not None: + size = len(data) + if data is not None and r'Content-Type' not in headers: + headers[r'Content-Type'] = r'application/mercurial-0.1' + + # Tell the server we accept application/mercurial-0.2 and multiple + # compression formats if the server is capable of emitting those + # payloads. + # Note: Keep this set empty by default, as client advertisement of + # protocol parameters should only occur after the handshake. + protoparams = set() + + mediatypes = set() + if caps is not None: + mt = capablefn('httpmediatype') + if mt: + protoparams.add('0.1') + mediatypes = set(mt.split(',')) + + protoparams.add('partial-pull') + + if '0.2tx' in mediatypes: + protoparams.add('0.2') + + if '0.2tx' in mediatypes and capablefn('compression'): + # We /could/ compare supported compression formats and prune + # non-mutually supported or error if nothing is mutually supported. + # For now, send the full list to the server and have it error. 
+ comps = [e.wireprotosupport().name for e in + util.compengines.supportedwireengines(util.CLIENTROLE)] + protoparams.add('comp=%s' % ','.join(comps)) + + if protoparams: + protoheaders = encodevalueinheaders(' '.join(sorted(protoparams)), + 'X-HgProto', + headersize or 1024) + for header, value in protoheaders: + headers[header] = value + + varyheaders = [] + for header in headers: + if header.lower().startswith(r'x-hg'): + varyheaders.append(header) + + if varyheaders: + headers[r'Vary'] = r','.join(sorted(varyheaders)) + + req = requestbuilder(pycompat.strurl(cu), data, headers) + + if data is not None: + ui.debug("sending %d bytes\n" % size) + req.add_unredirected_header(r'Content-Length', r'%d' % size) + + return req, cu, qs + +def _reqdata(req): + """Get request data, if any. If no data, returns None.""" + if pycompat.ispy3: + return req.data + if not req.has_data(): + return None + return req.get_data() + +def sendrequest(ui, opener, req): + """Send a prepared HTTP request. + + Returns the response object. 
+ """ + if (ui.debugflag + and ui.configbool('devel', 'debug.peer-request')): + dbg = ui.debug + line = 'devel-peer-request: %s\n' + dbg(line % '%s %s' % (pycompat.bytesurl(req.get_method()), + pycompat.bytesurl(req.get_full_url()))) + hgargssize = None + + for header, value in sorted(req.header_items()): + header = pycompat.bytesurl(header) + value = pycompat.bytesurl(value) + if header.startswith('X-hgarg-'): + if hgargssize is None: + hgargssize = 0 + hgargssize += len(value) + else: + dbg(line % ' %s %s' % (header, value)) + + if hgargssize is not None: + dbg(line % ' %d bytes of commands arguments in headers' + % hgargssize) + data = _reqdata(req) + if data is not None: + length = getattr(data, 'length', None) + if length is None: + length = len(data) + dbg(line % ' %d bytes of data' % length) + + start = util.timer() + + try: + res = opener.open(req) + except urlerr.httperror as inst: + if inst.code == 401: + raise error.Abort(_('authorization failed')) + raise + except httplib.HTTPException as inst: + ui.debug('http error requesting %s\n' % + util.hidepassword(req.get_full_url())) + ui.traceback() + raise IOError(None, inst) + finally: + if ui.configbool('devel', 'debug.peer-request'): + dbg(line % ' finished in %.4f seconds (%d)' + % (util.timer() - start, res.code)) + + # Insert error handlers for common I/O failures. 
+ _wraphttpresponse(res) + + return res + +def parsev1commandresponse(ui, baseurl, requrl, qs, resp, compressible, + allowcbor=False): + # record the url we got redirected to + respurl = pycompat.bytesurl(resp.geturl()) + if respurl.endswith(qs): + respurl = respurl[:-len(qs)] + if baseurl.rstrip('/') != respurl.rstrip('/'): + if not ui.quiet: + ui.warn(_('real URL is %s\n') % respurl) + + try: + proto = pycompat.bytesurl(resp.getheader(r'content-type', r'')) + except AttributeError: + proto = pycompat.bytesurl(resp.headers.get(r'content-type', r'')) + + safeurl = util.hidepassword(baseurl) + if proto.startswith('application/hg-error'): + raise error.OutOfBandError(resp.read()) + + # Pre 1.0 versions of Mercurial used text/plain and + # application/hg-changegroup. We don't support such old servers. + if not proto.startswith('application/mercurial-'): + ui.debug("requested URL: '%s'\n" % util.hidepassword(requrl)) + raise error.RepoError( + _("'%s' does not appear to be an hg repository:\n" + "---%%<--- (%s)\n%s\n---%%<---\n") + % (safeurl, proto or 'no content-type', resp.read(1024))) + + try: + subtype = proto.split('-', 1)[1] + + # Unless we end up supporting CBOR in the legacy wire protocol, + # this should ONLY be encountered for the initial capabilities + # request during handshake. + if subtype == 'cbor': + if allowcbor: + return respurl, proto, resp + else: + raise error.RepoError(_('unexpected CBOR response from ' + 'server')) + + version_info = tuple([int(n) for n in subtype.split('.')]) + except ValueError: + raise error.RepoError(_("'%s' sent a broken Content-Type " + "header (%s)") % (safeurl, proto)) + + # TODO consider switching to a decompression reader that uses + # generators. + if version_info == (0, 1): + if compressible: + resp = util.compengines['zlib'].decompressorreader(resp) + + elif version_info == (0, 2): + # application/mercurial-0.2 always identifies the compression + # engine in the payload header. 
+ elen = struct.unpack('B', resp.read(1))[0] + ename = resp.read(elen) + engine = util.compengines.forwiretype(ename) + + resp = engine.decompressorreader(resp) + else: + raise error.RepoError(_("'%s' uses newer protocol %s") % + (safeurl, subtype)) + + return respurl, proto, resp + +class httppeer(wireprotov1peer.wirepeer): + def __init__(self, ui, path, url, opener, requestbuilder, caps): + self.ui = ui self._path = path - self._caps = None - self._urlopener = None - self._requestbuilder = None - u = util.url(path) - if u.query or u.fragment: - raise error.Abort(_('unsupported URL component: "%s"') % - (u.query or u.fragment)) - - # urllib cannot handle URLs with embedded user or passwd - self._url, authinfo = u.authinfo() - - self._ui = ui - ui.debug('using %s\n' % self._url) - - self._urlopener = url.opener(ui, authinfo) - self._requestbuilder = urlreq.request + self._url = url + self._caps = caps + self._urlopener = opener + self._requestbuilder = requestbuilder def __del__(self): - urlopener = getattr(self, '_urlopener', None) - if urlopener: - for h in urlopener.handlers: - h.close() - getattr(h, "close_all", lambda: None)() - - def _openurl(self, req): - if (self._ui.debugflag - and self._ui.configbool('devel', 'debug.peer-request')): - dbg = self._ui.debug - line = 'devel-peer-request: %s\n' - dbg(line % '%s %s' % (req.get_method(), req.get_full_url())) - hgargssize = None - - for header, value in sorted(req.header_items()): - if header.startswith('X-hgarg-'): - if hgargssize is None: - hgargssize = 0 - hgargssize += len(value) - else: - dbg(line % ' %s %s' % (header, value)) + for h in self._urlopener.handlers: + h.close() + getattr(h, "close_all", lambda: None)() - if hgargssize is not None: - dbg(line % ' %d bytes of commands arguments in headers' - % hgargssize) - - if req.has_data(): - data = req.get_data() - length = getattr(data, 'length', None) - if length is None: - length = len(data) - dbg(line % ' %d bytes of data' % length) - - start = 
util.timer() - - ret = self._urlopener.open(req) - if self._ui.configbool('devel', 'debug.peer-request'): - dbg(line % ' finished in %.4f seconds (%s)' - % (util.timer() - start, ret.code)) - return ret - - # Begin of _basepeer interface. - - @util.propertycache - def ui(self): - return self._ui + # Begin of ipeerconnection interface. def url(self): return self._path @@ -217,189 +425,28 @@ def close(self): pass - # End of _basepeer interface. + # End of ipeerconnection interface. - # Begin of _basewirepeer interface. + # Begin of ipeercommands interface. def capabilities(self): - if self._caps is None: - try: - self._fetchcaps() - except error.RepoError: - self._caps = set() - self.ui.debug('capabilities: %s\n' % - (' '.join(self._caps or ['none']))) return self._caps - # End of _basewirepeer interface. + # End of ipeercommands interface. # look up capabilities only when needed - def _fetchcaps(self): - self._caps = set(self._call('capabilities').split()) - def _callstream(self, cmd, _compressible=False, **args): args = pycompat.byteskwargs(args) - if cmd == 'pushkey': - args['data'] = '' - data = args.pop('data', None) - headers = args.pop('headers', {}) - self.ui.debug("sending %s command\n" % cmd) - q = [('cmd', cmd)] - headersize = 0 - varyheaders = [] - # Important: don't use self.capable() here or else you end up - # with infinite recursion when trying to look up capabilities - # for the first time. 
- postargsok = self._caps is not None and 'httppostargs' in self._caps - if postargsok and args: - strargs = urlreq.urlencode(sorted(args.items())) - if not data: - data = strargs - else: - if isinstance(data, bytes): - i = io.BytesIO(data) - i.length = len(data) - data = i - argsio = io.BytesIO(strargs) - argsio.length = len(strargs) - data = _multifile(argsio, data) - headers[r'X-HgArgs-Post'] = len(strargs) - else: - if len(args) > 0: - httpheader = self.capable('httpheader') - if httpheader: - headersize = int(httpheader.split(',', 1)[0]) - if headersize > 0: - # The headers can typically carry more data than the URL. - encargs = urlreq.urlencode(sorted(args.items())) - for header, value in encodevalueinheaders(encargs, 'X-HgArg', - headersize): - headers[header] = value - varyheaders.append(header) - else: - q += sorted(args.items()) - qs = '?%s' % urlreq.urlencode(q) - cu = "%s%s" % (self._url, qs) - size = 0 - if util.safehasattr(data, 'length'): - size = data.length - elif data is not None: - size = len(data) - if size and self.ui.configbool('ui', 'usehttp2'): - headers[r'Expect'] = r'100-Continue' - headers[r'X-HgHttp2'] = r'1' - if data is not None and r'Content-Type' not in headers: - headers[r'Content-Type'] = r'application/mercurial-0.1' - - # Tell the server we accept application/mercurial-0.2 and multiple - # compression formats if the server is capable of emitting those - # payloads. - protoparams = [] - - mediatypes = set() - if self._caps is not None: - mt = self.capable('httpmediatype') - if mt: - protoparams.append('0.1') - mediatypes = set(mt.split(',')) - - if '0.2tx' in mediatypes: - protoparams.append('0.2') - - if '0.2tx' in mediatypes and self.capable('compression'): - # We /could/ compare supported compression formats and prune - # non-mutually supported or error if nothing is mutually supported. - # For now, send the full list to the server and have it error. 
- comps = [e.wireprotosupport().name for e in - util.compengines.supportedwireengines(util.CLIENTROLE)] - protoparams.append('comp=%s' % ','.join(comps)) + req, cu, qs = makev1commandrequest(self.ui, self._requestbuilder, + self._caps, self.capable, + self._url, cmd, args) - if protoparams: - protoheaders = encodevalueinheaders(' '.join(protoparams), - 'X-HgProto', - headersize or 1024) - for header, value in protoheaders: - headers[header] = value - varyheaders.append(header) - - if varyheaders: - headers[r'Vary'] = r','.join(varyheaders) - - req = self._requestbuilder(pycompat.strurl(cu), data, headers) - - if data is not None: - self.ui.debug("sending %s bytes\n" % size) - req.add_unredirected_header('Content-Length', '%d' % size) - try: - resp = self._openurl(req) - except urlerr.httperror as inst: - if inst.code == 401: - raise error.Abort(_('authorization failed')) - raise - except httplib.HTTPException as inst: - self.ui.debug('http error while sending %s command\n' % cmd) - self.ui.traceback() - raise IOError(None, inst) - - # Insert error handlers for common I/O failures. 
- _wraphttpresponse(resp) + resp = sendrequest(self.ui, self._urlopener, req) - # record the url we got redirected to - resp_url = pycompat.bytesurl(resp.geturl()) - if resp_url.endswith(qs): - resp_url = resp_url[:-len(qs)] - if self._url.rstrip('/') != resp_url.rstrip('/'): - if not self.ui.quiet: - self.ui.warn(_('real URL is %s\n') % resp_url) - self._url = resp_url - try: - proto = pycompat.bytesurl(resp.getheader(r'content-type', r'')) - except AttributeError: - proto = pycompat.bytesurl(resp.headers.get(r'content-type', r'')) - - safeurl = util.hidepassword(self._url) - if proto.startswith('application/hg-error'): - raise error.OutOfBandError(resp.read()) - # accept old "text/plain" and "application/hg-changegroup" for now - if not (proto.startswith('application/mercurial-') or - (proto.startswith('text/plain') - and not resp.headers.get('content-length')) or - proto.startswith('application/hg-changegroup')): - self.ui.debug("requested URL: '%s'\n" % util.hidepassword(cu)) - raise error.RepoError( - _("'%s' does not appear to be an hg repository:\n" - "---%%<--- (%s)\n%s\n---%%<---\n") - % (safeurl, proto or 'no content-type', resp.read(1024))) - - if proto.startswith('application/mercurial-'): - try: - version = proto.split('-', 1)[1] - version_info = tuple([int(n) for n in version.split('.')]) - except ValueError: - raise error.RepoError(_("'%s' sent a broken Content-Type " - "header (%s)") % (safeurl, proto)) - - # TODO consider switching to a decompression reader that uses - # generators. - if version_info == (0, 1): - if _compressible: - return util.compengines['zlib'].decompressorreader(resp) - return resp - elif version_info == (0, 2): - # application/mercurial-0.2 always identifies the compression - # engine in the payload header. 
- elen = struct.unpack('B', resp.read(1))[0] - ename = resp.read(elen) - engine = util.compengines.forwiretype(ename) - return engine.decompressorreader(resp) - else: - raise error.RepoError(_("'%s' uses newer protocol %s") % - (safeurl, version)) - - if _compressible: - return util.compengines['zlib'].decompressorreader(resp) + self._url, ct, resp = parsev1commandresponse(self.ui, self._url, cu, qs, + resp, _compressible) return resp @@ -430,7 +477,7 @@ tempname = bundle2.writebundle(self.ui, cg, None, type) fp = httpconnection.httpsendfile(self.ui, tempname, "rb") - headers = {'Content-Type': 'application/mercurial-0.1'} + headers = {r'Content-Type': r'application/mercurial-0.1'} try: r = self._call(cmd, data=fp, headers=headers, **args) @@ -438,6 +485,11 @@ if len(vals) < 2: raise error.ResponseError(_("unexpected response:"), r) return vals + except urlerr.httperror: + # Catch and re-raise these so we don't try and treat them + # like generic socket errors. They lack any values in + # .args on Python 3 which breaks our socket.error block. 
+ raise except socket.error as err: if err.args[0] in (errno.ECONNRESET, errno.EPIPE): raise error.Abort(_('push failed: %s') % err.args[1]) @@ -453,7 +505,7 @@ try: # dump bundle to disk fd, filename = tempfile.mkstemp(prefix="hg-bundle-", suffix=".hg") - fh = os.fdopen(fd, pycompat.sysstr("wb")) + fh = os.fdopen(fd, r"wb") d = fp.read(4096) while d: fh.write(d) @@ -461,7 +513,7 @@ fh.close() # start http push fp_ = httpconnection.httpsendfile(self.ui, filename, "rb") - headers = {'Content-Type': 'application/mercurial-0.1'} + headers = {r'Content-Type': r'application/mercurial-0.1'} return self._callstream(cmd, data=fp_, headers=headers, **args) finally: if fp_ is not None: @@ -476,28 +528,428 @@ def _abort(self, exception): raise exception -class httpspeer(httppeer): - def __init__(self, ui, path): - if not url.has_https: - raise error.Abort(_('Python support for SSL and HTTPS ' - 'is not installed')) - httppeer.__init__(self, ui, path) +def sendv2request(ui, opener, requestbuilder, apiurl, permission, requests): + reactor = wireprotoframing.clientreactor(hasmultiplesend=False, + buffersends=True) + + handler = wireprotov2peer.clienthandler(ui, reactor) + + url = '%s/%s' % (apiurl, permission) + + if len(requests) > 1: + url += '/multirequest' + else: + url += '/%s' % requests[0][0] + + for command, args, f in requests: + assert not list(handler.callcommand(command, args, f)) + + # TODO stream this. 
+ body = b''.join(map(bytes, handler.flushcommands())) + + # TODO modify user-agent to reflect v2 + headers = { + r'Accept': wireprotov2server.FRAMINGTYPE, + r'Content-Type': wireprotov2server.FRAMINGTYPE, + } + + req = requestbuilder(pycompat.strurl(url), body, headers) + req.add_unredirected_header(r'Content-Length', r'%d' % len(body)) + + try: + res = opener.open(req) + except urlerr.httperror as e: + if e.code == 401: + raise error.Abort(_('authorization failed')) + + raise + except httplib.HTTPException as e: + ui.traceback() + raise IOError(None, e) + + return handler, res + +class queuedcommandfuture(pycompat.futures.Future): + """Wraps result() on command futures to trigger submission on call.""" + + def result(self, timeout=None): + if self.done(): + return pycompat.futures.Future.result(self, timeout) + + self._peerexecutor.sendcommands() + + # sendcommands() will restore the original __class__ and self.result + # will resolve to Future.result. + return self.result(timeout) + +@zi.implementer(repository.ipeercommandexecutor) +class httpv2executor(object): + def __init__(self, ui, opener, requestbuilder, apiurl, descriptor): + self._ui = ui + self._opener = opener + self._requestbuilder = requestbuilder + self._apiurl = apiurl + self._descriptor = descriptor + self._sent = False + self._closed = False + self._neededpermissions = set() + self._calls = [] + self._futures = weakref.WeakSet() + self._responseexecutor = None + self._responsef = None + + def __enter__(self): + return self + + def __exit__(self, exctype, excvalue, exctb): + self.close() + + def callcommand(self, command, args): + if self._sent: + raise error.ProgrammingError('callcommand() cannot be used after ' + 'commands are sent') + + if self._closed: + raise error.ProgrammingError('callcommand() cannot be used after ' + 'close()') + + # The service advertises which commands are available. So if we attempt + # to call an unknown command or pass an unknown argument, we can screen + # for this. 
+ if command not in self._descriptor['commands']: + raise error.ProgrammingError( + 'wire protocol command %s is not available' % command) + + cmdinfo = self._descriptor['commands'][command] + unknownargs = set(args.keys()) - set(cmdinfo.get('args', {})) + + if unknownargs: + raise error.ProgrammingError( + 'wire protocol command %s does not accept argument: %s' % ( + command, ', '.join(sorted(unknownargs)))) + + self._neededpermissions |= set(cmdinfo['permissions']) + + # TODO we /could/ also validate types here, since the API descriptor + # includes types... + + f = pycompat.futures.Future() + + # Monkeypatch it so result() triggers sendcommands(), otherwise result() + # could deadlock. + f.__class__ = queuedcommandfuture + f._peerexecutor = self + + self._futures.add(f) + self._calls.append((command, args, f)) + + return f + + def sendcommands(self): + if self._sent: + return + + if not self._calls: + return + + self._sent = True + + # Unhack any future types so caller sees a clean type and so we + # break reference cycle. + for f in self._futures: + if isinstance(f, queuedcommandfuture): + f.__class__ = pycompat.futures.Future + f._peerexecutor = None + + # Mark the future as running and filter out cancelled futures. + calls = [(command, args, f) + for command, args, f in self._calls + if f.set_running_or_notify_cancel()] + + # Clear out references, prevent improper object usage. 
+ self._calls = None + + if not calls: + return + + permissions = set(self._neededpermissions) + + if 'push' in permissions and 'pull' in permissions: + permissions.remove('pull') + + if len(permissions) > 1: + raise error.RepoError(_('cannot make request requiring multiple ' + 'permissions: %s') % + _(', ').join(sorted(permissions))) + + permission = { + 'push': 'rw', + 'pull': 'ro', + }[permissions.pop()] + + handler, resp = sendv2request( + self._ui, self._opener, self._requestbuilder, self._apiurl, + permission, calls) + + # TODO we probably want to validate the HTTP code, media type, etc. + + self._responseexecutor = pycompat.futures.ThreadPoolExecutor(1) + self._responsef = self._responseexecutor.submit(self._handleresponse, + handler, resp) + + def close(self): + if self._closed: + return + + self.sendcommands() + + self._closed = True + + if not self._responsef: + return + + try: + self._responsef.result() + finally: + self._responseexecutor.shutdown(wait=True) + self._responsef = None + self._responseexecutor = None + + # If any of our futures are still in progress, mark them as + # errored, otherwise a result() could wait indefinitely. + for f in self._futures: + if not f.done(): + f.set_exception(error.ResponseError( + _('unfulfilled command response'))) + + self._futures = None + + def _handleresponse(self, handler, resp): + # Called in a thread to read the response. 
+ + while handler.readframe(resp): + pass -def instance(ui, path, create): +# TODO implement interface for version 2 peers +@zi.implementer(repository.ipeerconnection, repository.ipeercapabilities, + repository.ipeerrequests) +class httpv2peer(object): + def __init__(self, ui, repourl, apipath, opener, requestbuilder, + apidescriptor): + self.ui = ui + + if repourl.endswith('/'): + repourl = repourl[:-1] + + self._url = repourl + self._apipath = apipath + self._apiurl = '%s/%s' % (repourl, apipath) + self._opener = opener + self._requestbuilder = requestbuilder + self._descriptor = apidescriptor + + # Start of ipeerconnection. + + def url(self): + return self._url + + def local(self): + return None + + def peer(self): + return self + + def canpush(self): + # TODO change once implemented. + return False + + def close(self): + pass + + # End of ipeerconnection. + + # Start of ipeercapabilities. + + def capable(self, name): + # The capabilities used internally historically map to capabilities + # advertised from the "capabilities" wire protocol command. However, + # version 2 of that command works differently. + + # Maps to commands that are available. + if name in ('branchmap', 'getbundle', 'known', 'lookup', 'pushkey'): + return True + + # Other concepts. + if name in ('bundle2',): + return True + + return False + + def requirecap(self, name, purpose): + if self.capable(name): + return + + raise error.CapabilityError( + _('cannot %s; client or remote repository does not support the %r ' + 'capability') % (purpose, name)) + + # End of ipeercapabilities. + + def _call(self, name, **args): + with self.commandexecutor() as e: + return e.callcommand(name, args).result() + + def commandexecutor(self): + return httpv2executor(self.ui, self._opener, self._requestbuilder, + self._apiurl, self._descriptor) + +# Registry of API service names to metadata about peers that handle it. 
+# +# The following keys are meaningful: +# +# init +# Callable receiving (ui, repourl, servicepath, opener, requestbuilder, +# apidescriptor) to create a peer. +# +# priority +# Integer priority for the service. If we could choose from multiple +# services, we choose the one with the highest priority. +API_PEERS = { + wireprototypes.HTTP_WIREPROTO_V2: { + 'init': httpv2peer, + 'priority': 50, + }, +} + +def performhandshake(ui, url, opener, requestbuilder): + # The handshake is a request to the capabilities command. + + caps = None + def capable(x): + raise error.ProgrammingError('should not be called') + + args = {} + + # The client advertises support for newer protocols by adding an + # X-HgUpgrade-* header with a list of supported APIs and an + # X-HgProto-* header advertising which serializing formats it supports. + # We only support the HTTP version 2 transport and CBOR responses for + # now. + advertisev2 = ui.configbool('experimental', 'httppeer.advertise-v2') + + if advertisev2: + args['headers'] = { + r'X-HgProto-1': r'cbor', + } + + args['headers'].update( + encodevalueinheaders(' '.join(sorted(API_PEERS)), + 'X-HgUpgrade', + # We don't know the header limit this early. + # So make it small. + 1024)) + + req, requrl, qs = makev1commandrequest(ui, requestbuilder, caps, + capable, url, 'capabilities', + args) + + resp = sendrequest(ui, opener, req) + + respurl, ct, resp = parsev1commandresponse(ui, url, requrl, qs, resp, + compressible=False, + allowcbor=advertisev2) + + try: + rawdata = resp.read() + finally: + resp.close() + + if not ct.startswith('application/mercurial-'): + raise error.ProgrammingError('unexpected content-type: %s' % ct) + + if advertisev2: + if ct == 'application/mercurial-cbor': + try: + info = cbor.loads(rawdata) + except cbor.CBORDecodeError: + raise error.Abort(_('error decoding CBOR from remote server'), + hint=_('try again and consider contacting ' + 'the server operator')) + + # We got a legacy response. That's fine. 
+ elif ct in ('application/mercurial-0.1', 'application/mercurial-0.2'): + info = { + 'v1capabilities': set(rawdata.split()) + } + + else: + raise error.RepoError( + _('unexpected response type from server: %s') % ct) + else: + info = { + 'v1capabilities': set(rawdata.split()) + } + + return respurl, info + +def makepeer(ui, path, opener=None, requestbuilder=urlreq.request): + """Construct an appropriate HTTP peer instance. + + ``opener`` is an ``url.opener`` that should be used to establish + connections, perform HTTP requests. + + ``requestbuilder`` is the type used for constructing HTTP requests. + It exists as an argument so extensions can override the default. + """ + u = util.url(path) + if u.query or u.fragment: + raise error.Abort(_('unsupported URL component: "%s"') % + (u.query or u.fragment)) + + # urllib cannot handle URLs with embedded user or passwd. + url, authinfo = u.authinfo() + ui.debug('using %s\n' % url) + + opener = opener or urlmod.opener(ui, authinfo) + + respurl, info = performhandshake(ui, url, opener, requestbuilder) + + # Given the intersection of APIs that both we and the server support, + # sort by their advertised priority and pick the first one. + # + # TODO consider making this request-based and interface driven. For + # example, the caller could say "I want a peer that does X." It's quite + # possible that not all peers would do that. Since we know the service + # capabilities, we could filter out services not meeting the + # requirements. Possibly by consulting the interfaces defined by the + # peer type. + apipeerchoices = set(info.get('apis', {}).keys()) & set(API_PEERS.keys()) + + preferredchoices = sorted(apipeerchoices, + key=lambda x: API_PEERS[x]['priority'], + reverse=True) + + for service in preferredchoices: + apipath = '%s/%s' % (info['apibase'].rstrip('/'), service) + + return API_PEERS[service]['init'](ui, respurl, apipath, opener, + requestbuilder, + info['apis'][service]) + + # Failed to construct an API peer. 
Fall back to legacy. + return httppeer(ui, path, respurl, opener, requestbuilder, + info['v1capabilities']) + +def instance(ui, path, create, intents=None): if create: raise error.Abort(_('cannot create new http repository')) try: - if path.startswith('https:'): - inst = httpspeer(ui, path) - else: - inst = httppeer(ui, path) - try: - # Try to do useful work when checking compatibility. - # Usually saves a roundtrip since we want the caps anyway. - inst._fetchcaps() - except error.RepoError: - # No luck, try older compatibility check. - inst.between([(nullid, nullid)]) + if path.startswith('https:') and not urlmod.has_https: + raise error.Abort(_('Python support for SSL and HTTPS ' + 'is not installed')) + + inst = makepeer(ui, path) + return inst except error.RepoError as httpexception: try: diff -r fb92df8b634c -r ed5448edcbfa mercurial/i18n.py --- a/mercurial/i18n.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/i18n.py Wed Apr 18 15:32:08 2018 -0400 @@ -50,8 +50,8 @@ def setdatapath(datapath): datapath = pycompat.fsdecode(datapath) - localedir = os.path.join(datapath, pycompat.sysstr('locale')) - t = gettextmod.translation('hg', localedir, _languages, fallback=True) + localedir = os.path.join(datapath, r'locale') + t = gettextmod.translation(r'hg', localedir, _languages, fallback=True) global _ugettext try: _ugettext = t.ugettext diff -r fb92df8b634c -r ed5448edcbfa mercurial/keepalive.py --- a/mercurial/keepalive.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/keepalive.py Wed Apr 18 15:32:08 2018 -0400 @@ -97,6 +97,9 @@ urllibcompat, util, ) +from .utils import ( + procutil, +) httplib = util.httplib urlerr = util.urlerr @@ -315,24 +318,24 @@ headers.update(sorted(req.unredirected_hdrs.items())) headers = util.sortdict((n.lower(), v) for n, v in headers.items()) skipheaders = {} - for n in ('host', 'accept-encoding'): + for n in (r'host', r'accept-encoding'): if n in headers: - skipheaders['skip_' + n.replace('-', '_')] = 1 + skipheaders[r'skip_' + 
n.replace(r'-', r'_')] = 1 try: if urllibcompat.hasdata(req): data = urllibcompat.getdata(req) h.putrequest( req.get_method(), urllibcompat.getselector(req), - **pycompat.strkwargs(skipheaders)) - if 'content-type' not in headers: - h.putheader('Content-type', - 'application/x-www-form-urlencoded') - if 'content-length' not in headers: - h.putheader('Content-length', '%d' % len(data)) + **skipheaders) + if r'content-type' not in headers: + h.putheader(r'Content-type', + r'application/x-www-form-urlencoded') + if r'content-length' not in headers: + h.putheader(r'Content-length', r'%d' % len(data)) else: h.putrequest( req.get_method(), urllibcompat.getselector(req), - **pycompat.strkwargs(skipheaders)) + **skipheaders) except socket.error as err: raise urlerr.urlerror(err) for k, v in headers.items(): @@ -346,7 +349,7 @@ class HTTPResponse(httplib.HTTPResponse): # we need to subclass HTTPResponse in order to - # 1) add readline() and readlines() methods + # 1) add readline(), readlines(), and readinto() methods # 2) add close_connection() methods # 3) add info() and geturl() methods @@ -381,6 +384,7 @@ self._connection = None # (same) _raw_read = httplib.HTTPResponse.read + _raw_readinto = getattr(httplib.HTTPResponse, 'readinto', None) def close(self): if self.fp: @@ -519,6 +523,26 @@ break return list + def readinto(self, dest): + if self._raw_readinto is None: + res = self.read(len(dest)) + if not res: + return 0 + dest[0:len(res)] = res + return len(res) + total = len(dest) + have = len(self._rbuf) + if have >= total: + dest[0:total] = self._rbuf[:total] + self._rbuf = self._rbuf[total:] + return total + mv = memoryview(dest) + got = self._raw_readinto(mv[have:total]) + dest[0:have] = self._rbuf + got += len(self._rbuf) + self._rbuf = '' + return got + def safesend(self, str): """Send `str' to the server. 
@@ -635,14 +659,14 @@ def comp(N, url): print(' making %i connections to:\n %s' % (N, url)) - util.stdout.write(' first using the normal urllib handlers') + procutil.stdout.write(' first using the normal urllib handlers') # first use normal opener opener = urlreq.buildopener() urlreq.installopener(opener) t1 = fetch(N, url) print(' TIME: %.3f s' % t1) - util.stdout.write(' now using the keepalive handler ') + procutil.stdout.write(' now using the keepalive handler ') # now install the keepalive handler and try again opener = urlreq.buildopener(HTTPHandler()) urlreq.installopener(opener) @@ -687,11 +711,11 @@ i = 20 print(" waiting %i seconds for the server to close the connection" % i) while i > 0: - util.stdout.write('\r %2i' % i) - util.stdout.flush() + procutil.stdout.write('\r %2i' % i) + procutil.stdout.flush() time.sleep(1) i -= 1 - util.stderr.write('\r') + procutil.stderr.write('\r') print(" fetching the file a second time") fo = urlreq.urlopen(url) diff -r fb92df8b634c -r ed5448edcbfa mercurial/localrepo.py --- a/mercurial/localrepo.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/localrepo.py Wed Apr 18 15:32:08 2018 -0400 @@ -9,9 +9,9 @@ import errno import hashlib -import inspect import os import random +import sys import time import weakref @@ -21,6 +21,9 @@ nullid, short, ) +from .thirdparty.zope import ( + interface as zi, +) from . 
import ( bookmarks, branchmap, @@ -44,9 +47,9 @@ merge as mergemod, mergeutil, namespaces, + narrowspec, obsolete, pathutil, - peer, phases, pushkey, pycompat, @@ -57,13 +60,17 @@ scmutil, sparse, store, - subrepo, + subrepoutil, tags as tagsmod, transaction, txnutil, util, vfs as vfsmod, ) +from .utils import ( + procutil, + stringutil, +) release = lockmod.release urlerr = util.urlerr @@ -146,6 +153,50 @@ 'unbundle'} legacycaps = moderncaps.union({'changegroupsubset'}) +@zi.implementer(repository.ipeercommandexecutor) +class localcommandexecutor(object): + def __init__(self, peer): + self._peer = peer + self._sent = False + self._closed = False + + def __enter__(self): + return self + + def __exit__(self, exctype, excvalue, exctb): + self.close() + + def callcommand(self, command, args): + if self._sent: + raise error.ProgrammingError('callcommand() cannot be used after ' + 'sendcommands()') + + if self._closed: + raise error.ProgrammingError('callcommand() cannot be used after ' + 'close()') + + # We don't need to support anything fancy. Just call the named + # method on the peer and return a resolved future. + fn = getattr(self._peer, pycompat.sysstr(command)) + + f = pycompat.futures.Future() + + try: + result = fn(**pycompat.strkwargs(args)) + except Exception: + pycompat.future_set_exception_info(f, sys.exc_info()[1:]) + else: + f.set_result(result) + + return f + + def sendcommands(self): + self._sent = True + + def close(self): + self._closed = True + +@zi.implementer(repository.ipeercommands) class localpeer(repository.peer): '''peer for a local repo; reflects only the most recent API''' @@ -155,15 +206,11 @@ if caps is None: caps = moderncaps.copy() self._repo = repo.filtered('served') - self._ui = repo.ui + self.ui = repo.ui self._caps = repo._restrictcapabilities(caps) # Begin of _basepeer interface. 
- @util.propertycache - def ui(self): - return self._ui - def url(self): return self._repo.url() @@ -189,9 +236,14 @@ def capabilities(self): return self._caps + def clonebundles(self): + return self._repo.tryread('clonebundles.manifest') + def debugwireargs(self, one, two, three=None, four=None, five=None): """Used to test argument passing over the wire""" - return "%s %s %s %s %s" % (one, two, three, four, five) + return "%s %s %s %s %s" % (one, two, pycompat.bytestr(three), + pycompat.bytestr(four), + pycompat.bytestr(five)) def getbundle(self, source, heads=None, common=None, bundlecaps=None, **kwargs): @@ -227,14 +279,14 @@ raise error.Abort(_('cannot perform stream clone against local ' 'peer')) - def unbundle(self, cg, heads, url): + def unbundle(self, bundle, heads, url): """apply a bundle on a repo This function handles the repo locking itself.""" try: try: - cg = exchange.readbundle(self.ui, cg, None) - ret = exchange.unbundle(self._repo, cg, heads, 'push', url) + bundle = exchange.readbundle(self.ui, bundle, None) + ret = exchange.unbundle(self._repo, bundle, heads, 'push', url) if util.safehasattr(ret, 'getchunks'): # This is a bundle20 object, turn it into an unbundler. # This little dance should be dropped eventually when the @@ -260,18 +312,20 @@ bundle2.processbundle(self._repo, b) raise except error.PushRaced as exc: - raise error.ResponseError(_('push failed:'), str(exc)) + raise error.ResponseError(_('push failed:'), + stringutil.forcebytestr(exc)) # End of _basewirecommands interface. # Begin of peer interface. - def iterbatch(self): - return peer.localiterbatcher(self) + def commandexecutor(self): + return localcommandexecutor(self) # End of peer interface. 
-class locallegacypeer(repository.legacypeer, localpeer): +@zi.implementer(repository.ipeerlegacycommands) +class locallegacypeer(localpeer): '''peer extension which implements legacy methods too; used for tests with restricted capabilities''' @@ -286,8 +340,8 @@ def branches(self, nodes): return self._repo.branches(nodes) - def changegroup(self, basenodes, source): - outgoing = discovery.outgoing(self._repo, missingroots=basenodes, + def changegroup(self, nodes, source): + outgoing = discovery.outgoing(self._repo, missingroots=nodes, missingheads=self._repo.heads()) return changegroup.makechangegroup(self._repo, outgoing, '01', source) @@ -302,13 +356,27 @@ # clients. REVLOGV2_REQUIREMENT = 'exp-revlogv2.0' +# Functions receiving (ui, features) that extensions can register to impact +# the ability to load repositories with custom requirements. Only +# functions defined in loaded extensions are called. +# +# The function receives a set of requirement strings that the repository +# is capable of opening. Functions will typically add elements to the +# set to reflect that the extension knows how to handle that requirements. +featuresetupfuncs = set() + +@zi.implementer(repository.completelocalrepository) class localrepository(object): + # obsolete experimental requirements: + # - manifestv2: An experimental new manifest format that allowed + # for stem compression of long paths. Experiment ended up not + # being successful (repository sizes went up due to worse delta + # chains), and the code was deleted in 4.6. supportedformats = { 'revlogv1', 'generaldelta', 'treemanifest', - 'manifestv2', REVLOGV2_REQUIREMENT, } _basesupported = supportedformats | { @@ -323,13 +391,8 @@ 'revlogv1', 'generaldelta', 'treemanifest', - 'manifestv2', } - # a list of (ui, featureset) functions. 
- # only functions defined in module of enabled extensions are invoked - featuresetupfuncs = set() - # list of prefix for file which can be written without 'wlock' # Extensions should extend this list when needed _wlockfreeprefix = { @@ -350,7 +413,7 @@ 'bisect.state', } - def __init__(self, baseui, path, create=False): + def __init__(self, baseui, path, create=False, intents=None): self.requirements = set() self.filtername = None # wvfs: rooted at the repository root, used to access the working copy @@ -389,11 +452,11 @@ except IOError: pass - if self.featuresetupfuncs: + if featuresetupfuncs: self.supported = set(self._basesupported) # use private copy extmods = set(m.__name__ for n, m in extensions.extensions(self.ui)) - for setupfunc in self.featuresetupfuncs: + for setupfunc in featuresetupfuncs: if setupfunc.__module__ in extmods: setupfunc(self.ui, self.supported) else: @@ -480,7 +543,7 @@ self._branchcaches = {} self._revbranchcache = None - self.filterpats = {} + self._filterpats = {} self._datafilters = {} self._transref = self._lockref = self._wlockref = None @@ -733,9 +796,42 @@ " working parent %s!\n") % short(node)) return nullid + @repofilecache(narrowspec.FILENAME) + def narrowpats(self): + """matcher patterns for this repository's narrowspec + + A tuple of (includes, excludes). + """ + source = self + if self.shared(): + from . import hg + source = hg.sharedreposource(self) + return narrowspec.load(source) + + @repofilecache(narrowspec.FILENAME) + def _narrowmatch(self): + if changegroup.NARROW_REQUIREMENT not in self.requirements: + return matchmod.always(self.root, '') + include, exclude = self.narrowpats + return narrowspec.match(self.root, include=include, exclude=exclude) + + # TODO(martinvonz): make this property-like instead? + def narrowmatch(self): + return self._narrowmatch + + def setnarrowpats(self, newincludes, newexcludes): + target = self + if self.shared(): + from . 
import hg + target = hg.sharedreposource(self) + narrowspec.save(target, newincludes, newexcludes) + self.invalidate(clearfilecache=True) + def __getitem__(self, changeid): if changeid is None: return context.workingctx(self) + if isinstance(changeid, context.basectx): + return changeid if isinstance(changeid, slice): # wdirrev isn't contiguous so the slice shouldn't include it return [context.changectx(self, i) @@ -754,7 +850,8 @@ try: self[changeid] return True - except error.RepoLookupError: + except (error.RepoLookupError, error.FilteredIndexError, + error.FilteredLookupError): return False def __nonzero__(self): @@ -808,7 +905,8 @@ ``{name: definitionstring}``. ''' if user: - m = revset.matchany(self.ui, specs, repo=self, + m = revset.matchany(self.ui, specs, + lookup=revset.lookupfn(self), localalias=localalias) else: m = revset.matchany(None, specs, localalias=localalias) @@ -969,15 +1067,13 @@ pass def lookup(self, key): - return self[key].node() + return scmutil.revsymbol(self, key).node() - def lookupbranch(self, key, remote=None): - repo = remote or self - if key in repo.branchmap(): + def lookupbranch(self, key): + if key in self.branchmap(): return key - repo = (remote and remote.local()) and remote or self - return repo[key].branch() + return scmutil.revsymbol(self, key).branch() def known(self, nodes): cl = self.changelog @@ -1021,9 +1117,6 @@ f = f[1:] return filelog.filelog(self.svfs, f) - def changectx(self, changeid): - return self[changeid] - def setparents(self, p1, p2=nullid): with self.dirstate.parentchange(): copies = self.dirstate.setparents(p1, p2) @@ -1040,10 +1133,11 @@ if f not in pctx and s not in pctx: self.dirstate.copy(None, f) - def filectx(self, path, changeid=None, fileid=None): + def filectx(self, path, changeid=None, fileid=None, changectx=None): """changeid can be a changeset revision, node, or tag. 
fileid can be a file revision or node.""" - return context.filectx(self, path, changeid, fileid) + return context.filectx(self, path, changeid, fileid, + changectx=changectx) def getcwd(self): return self.dirstate.getcwd() @@ -1052,7 +1146,7 @@ return self.dirstate.pathto(f, cwd) def _loadfilter(self, filter): - if filter not in self.filterpats: + if filter not in self._filterpats: l = [] for pat, cmd in self.ui.configitems(filter): if cmd == '!': @@ -1066,14 +1160,14 @@ params = cmd[len(name):].lstrip() break if not fn: - fn = lambda s, c, **kwargs: util.filter(s, c) + fn = lambda s, c, **kwargs: procutil.filter(s, c) # Wrap old filters not supporting keyword arguments - if not inspect.getargspec(fn)[2]: + if not pycompat.getargspec(fn)[2]: oldfn = fn fn = lambda s, c, **kwargs: oldfn(s, c) l.append((mf, fn, params)) - self.filterpats[filter] = l - return self.filterpats[filter] + self._filterpats[filter] = l + return self._filterpats[filter] def _filter(self, filterpats, filename, data): for mf, fn, cmd in filterpats: @@ -1140,7 +1234,7 @@ raise error.ProgrammingError('transaction requires locking') tr = self.currenttransaction() if tr is not None: - return tr.nest() + return tr.nest(name=desc) # abort here if the journal already exists if self.svfs.exists("journal"): @@ -1279,7 +1373,8 @@ self.store.createmode, validator=validate, releasefn=releasefn, - checkambigfiles=_cachedfiles) + checkambigfiles=_cachedfiles, + name=desc) tr.changes['revs'] = xrange(0, 0) tr.changes['obsmarkers'] = set() tr.changes['phases'] = {} @@ -1332,7 +1427,7 @@ """To be run if transaction is aborted """ reporef().hook('txnabort', throw=False, txnname=desc, - **tr2.hookargs) + **pycompat.strkwargs(tr2.hookargs)) tr.addabort('txnabort-hook', txnaborthook) # avoid eager cache invalidation. in-memory data should be identical # to stored data if transaction has no error. 
@@ -1481,12 +1576,15 @@ return updater @unfilteredmethod - def updatecaches(self, tr=None): + def updatecaches(self, tr=None, full=False): """warm appropriate caches If this function is called after a transaction closed. The transaction will be available in the 'tr' argument. This can be used to selectively update caches relevant to the changes in that transaction. + + If 'full' is set, make sure all caches the function knows about have + up-to-date data. Even the ones usually loaded more lazily. """ if tr is not None and tr.hookargs.get('source') == 'strip': # During strip, many caches are invalid but @@ -1498,6 +1596,12 @@ self.ui.debug('updating the branch cache\n') branchmap.updatecache(self.filtered('served')) + if full: + rbc = self.revbranchcache() + for r in self.changelog: + rbc.branchinfo(r) + rbc.write() + def invalidatecaches(self): if '_tagscache' in vars(self): @@ -1574,7 +1678,8 @@ def _refreshfilecachestats(self, tr): """Reload stats of cached files so that they are flagged as valid""" for k, ce in self._filecache.items(): - if k == 'dirstate' or k not in self.__dict__: + k = pycompat.sysstr(k) + if k == r'dirstate' or k not in self.__dict__: continue ce.refresh() @@ -1832,7 +1937,7 @@ status.modified.extend(status.clean) # mq may commit clean files # check subrepos - subs, commitsubs, newstate = subrepo.precommit( + subs, commitsubs, newstate = subrepoutil.precommit( self.ui, wctx, status, match, force=force) # make sure all explicit patterns are matched @@ -1869,10 +1974,10 @@ for s in sorted(commitsubs): sub = wctx.sub(s) self.ui.status(_('committing subrepository %s\n') % - subrepo.subrelpath(sub)) + subrepoutil.subrelpath(sub)) sr = sub.commit(cctx._text, user, date) newstate[s] = (newstate[s][0], sr) - subrepo.writestate(self, newstate) + subrepoutil.writestate(self, newstate) p1, p2 = self.dirstate.parents() hookp1, hookp2 = hex(p1), (p2 != nullid and hex(p2) or '') @@ -1982,7 +2087,7 @@ self.hook('pretxncommit', throw=True, node=hex(n), 
parent1=xp1, parent2=xp2) # set the new commit is proper phase - targetphase = subrepo.newcommitphase(self.ui, ctx) + targetphase = subrepoutil.newcommitphase(self.ui, ctx) if targetphase: # retract boundary do not alter parent changeset. # if a parent have higher the resulting phase will @@ -2047,15 +2152,6 @@ # tag cache retrieval" case to work. self.invalidate() - def walk(self, match, node=None): - ''' - walk recursively through the directory tree or a given - changeset, finding all files matched by the match - function - ''' - self.ui.deprecwarn('use repo[node].walk instead of repo.walk', '4.3') - return self[node].walk(match) - def status(self, node1='.', node2=None, match=None, ignored=False, clean=False, unknown=False, listsubrepos=False): @@ -2176,10 +2272,11 @@ hookargs = {} if tr is not None: hookargs.update(tr.hookargs) - hookargs['namespace'] = namespace - hookargs['key'] = key - hookargs['old'] = old - hookargs['new'] = new + hookargs = pycompat.strkwargs(hookargs) + hookargs[r'namespace'] = namespace + hookargs[r'key'] = key + hookargs[r'old'] = old + hookargs[r'new'] = new self.hook('prepushkey', throw=True, **hookargs) except error.HookAbort as exc: self.ui.write_err(_("pushkey-abort: %s\n") % exc) @@ -2203,7 +2300,9 @@ def debugwireargs(self, one, two, three=None, four=None, five=None): '''used to test argument passing over the wire''' - return "%s %s %s %s %s" % (one, two, three, four, five) + return "%s %s %s %s %s" % (one, two, pycompat.bytestr(three), + pycompat.bytestr(four), + pycompat.bytestr(five)) def savecommitmessage(self, text): fp = self.vfs('last-message.txt', 'wb') @@ -2233,8 +2332,9 @@ assert name.startswith('journal') return os.path.join(base, name.replace('journal', 'undo', 1)) -def instance(ui, path, create): - return localrepository(ui, util.urllocalpath(path), create) +def instance(ui, path, create, intents=None): + return localrepository(ui, util.urllocalpath(path), create, + intents=intents) def islocal(path): return True @@ 
-2270,8 +2370,6 @@ requirements.add('generaldelta') if ui.configbool('experimental', 'treemanifest'): requirements.add('treemanifest') - if ui.configbool('experimental', 'manifestv2'): - requirements.add('manifestv2') revlogv2 = ui.config('experimental', 'revlogv2') if revlogv2 == 'enable-unstable-format-and-corrupt-my-data': diff -r fb92df8b634c -r ed5448edcbfa mercurial/lock.py --- a/mercurial/lock.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/lock.py Wed Apr 18 15:32:08 2018 -0400 @@ -10,6 +10,7 @@ import contextlib import errno import os +import signal import socket import time import warnings @@ -20,7 +21,10 @@ encoding, error, pycompat, - util, +) + +from .utils import ( + procutil, ) def _getlockprefix(): @@ -30,9 +34,7 @@ confidence. Typically it's just hostname. On modern linux, we include an extra Linux-specific pid namespace identifier. """ - result = socket.gethostname() - if pycompat.ispy3: - result = result.encode(pycompat.sysstr(encoding.encoding), 'replace') + result = encoding.strtolocal(socket.gethostname()) if pycompat.sysplatform.startswith('linux'): try: result += '/%x' % os.stat('/proc/self/ns/pid').st_ino @@ -41,6 +43,64 @@ raise return result +@contextlib.contextmanager +def _delayedinterrupt(): + """Block signal interrupt while doing something critical + + This makes sure that the code block wrapped by this context manager won't + be interrupted. + + For Windows developers: It appears not possible to guard time.sleep() + from CTRL_C_EVENT, so please don't use time.sleep() to test if this is + working. 
+ """ + assertedsigs = [] + blocked = False + orighandlers = {} + + def raiseinterrupt(num): + if (num == getattr(signal, 'SIGINT', None) or + num == getattr(signal, 'CTRL_C_EVENT', None)): + raise KeyboardInterrupt + else: + raise error.SignalInterrupt + def catchterm(num, frame): + if blocked: + assertedsigs.append(num) + else: + raiseinterrupt(num) + + try: + # save handlers first so they can be restored even if a setup is + # interrupted between signal.signal() and orighandlers[] =. + for name in ['CTRL_C_EVENT', 'SIGINT', 'SIGBREAK', 'SIGHUP', 'SIGTERM']: + num = getattr(signal, name, None) + if num and num not in orighandlers: + orighandlers[num] = signal.getsignal(num) + try: + for num in orighandlers: + signal.signal(num, catchterm) + except ValueError: + pass # in a thread? no luck + + blocked = True + yield + finally: + # no simple way to reliably restore all signal handlers because + # any loops, recursive function calls, except blocks, etc. can be + # interrupted. so instead, make catchterm() raise interrupt. + blocked = False + try: + for num, handler in orighandlers.items(): + signal.signal(num, handler) + except ValueError: + pass # in a thread? 
+ + # re-raise interrupt exception if any, which may be shadowed by a new + # interrupt occurred while re-raising the first one + if assertedsigs: + raiseinterrupt(assertedsigs[0]) + def trylock(ui, vfs, lockname, timeout, warntimeout, *args, **kwargs): """return an acquired lock or raise an a LockHeld exception @@ -52,10 +112,12 @@ # show more details for new-style locks if ':' in locker: host, pid = locker.split(":", 1) - msg = _("waiting for lock on %s held by process %r " - "on host %r\n") % (l.desc, pid, host) + msg = (_("waiting for lock on %s held by process %r on host %r\n") + % (pycompat.bytestr(l.desc), pycompat.bytestr(pid), + pycompat.bytestr(host))) else: - msg = _("waiting for lock on %s held by %r\n") % (l.desc, locker) + msg = (_("waiting for lock on %s held by %r\n") + % (l.desc, pycompat.bytestr(locker))) printer(msg) l = lock(vfs, lockname, 0, *args, dolock=False, **kwargs) @@ -86,9 +148,9 @@ l.delay = delay if l.delay: if 0 <= warningidx <= l.delay: - ui.warn(_("got lock after %s seconds\n") % l.delay) + ui.warn(_("got lock after %d seconds\n") % l.delay) else: - ui.debug("got lock after %s seconds\n" % l.delay) + ui.debug("got lock after %d seconds\n" % l.delay) if l.acquirefn: l.acquirefn() return l @@ -113,11 +175,11 @@ _host = None - def __init__(self, vfs, file, timeout=-1, releasefn=None, acquirefn=None, + def __init__(self, vfs, fname, timeout=-1, releasefn=None, acquirefn=None, desc=None, inheritchecker=None, parentlock=None, dolock=True): self.vfs = vfs - self.f = file + self.f = fname self.held = 0 self.timeout = timeout self.releasefn = releasefn @@ -153,8 +215,8 @@ self.release() def _getpid(self): - # wrapper around util.getpid() to make testing easier - return util.getpid() + # wrapper around procutil.getpid() to make testing easier + return procutil.getpid() def lock(self): timeout = self.timeout @@ -182,8 +244,9 @@ while not self.held and retry: retry -= 1 try: - self.vfs.makelock(lockname, self.f) - self.held = 1 + with 
_delayedinterrupt(): + self.vfs.makelock(lockname, self.f) + self.held = 1 except (OSError, IOError) as why: if why.errno == errno.EEXIST: locker = self._readlock() @@ -239,7 +302,7 @@ pid = int(pid) except ValueError: return locker - if util.testpid(pid): + if procutil.testpid(pid): return locker # if locker dead, break lock. must do this with another lock # held, or can race and break valid lock. @@ -285,7 +348,7 @@ if self._parentheld: lockname = self.parentlock else: - lockname = '%s:%s' % (lock._host, self.pid) + lockname = b'%s:%d' % (lock._host, self.pid) self._inherited = True try: yield lockname diff -r fb92df8b634c -r ed5448edcbfa mercurial/logcmdutil.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/logcmdutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,915 @@ +# logcmdutil.py - utility for log-like commands +# +# Copyright 2005-2007 Matt Mackall +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import itertools +import os + +from .i18n import _ +from .node import ( + nullid, +) + +from . 
import ( + dagop, + error, + formatter, + graphmod, + match as matchmod, + mdiff, + patch, + pathutil, + pycompat, + revset, + revsetlang, + scmutil, + smartset, + templatekw, + templater, + util, +) +from .utils import ( + dateutil, + stringutil, +) + +def getlimit(opts): + """get the log limit according to option -l/--limit""" + limit = opts.get('limit') + if limit: + try: + limit = int(limit) + except ValueError: + raise error.Abort(_('limit must be a positive integer')) + if limit <= 0: + raise error.Abort(_('limit must be positive')) + else: + limit = None + return limit + +def diffordiffstat(ui, repo, diffopts, node1, node2, match, + changes=None, stat=False, fp=None, prefix='', + root='', listsubrepos=False, hunksfilterfn=None): + '''show diff or diffstat.''' + if root: + relroot = pathutil.canonpath(repo.root, repo.getcwd(), root) + else: + relroot = '' + if relroot != '': + # XXX relative roots currently don't work if the root is within a + # subrepo + uirelroot = match.uipath(relroot) + relroot += '/' + for matchroot in match.files(): + if not matchroot.startswith(relroot): + ui.warn(_('warning: %s not inside relative root %s\n') % ( + match.uipath(matchroot), uirelroot)) + + if stat: + diffopts = diffopts.copy(context=0, noprefix=False) + width = 80 + if not ui.plain(): + width = ui.termwidth() + + chunks = patch.diff(repo, node1, node2, match, changes, opts=diffopts, + prefix=prefix, relroot=relroot, + hunksfilterfn=hunksfilterfn) + + if fp is not None or ui.canwritewithoutlabels(): + out = fp or ui + if stat: + chunks = [patch.diffstat(util.iterlines(chunks), width=width)] + for chunk in util.filechunkiter(util.chunkbuffer(chunks)): + out.write(chunk) + else: + if stat: + chunks = patch.diffstatui(util.iterlines(chunks), width=width) + else: + chunks = patch.difflabel(lambda chunks, **kwargs: chunks, chunks, + opts=diffopts) + if ui.canbatchlabeledwrites(): + def gen(): + for chunk, label in chunks: + yield ui.label(chunk, label=label) + for chunk in 
util.filechunkiter(util.chunkbuffer(gen())): + ui.write(chunk) + else: + for chunk, label in chunks: + ui.write(chunk, label=label) + + if listsubrepos: + ctx1 = repo[node1] + ctx2 = repo[node2] + for subpath, sub in scmutil.itersubrepos(ctx1, ctx2): + tempnode2 = node2 + try: + if node2 is not None: + tempnode2 = ctx2.substate[subpath][1] + except KeyError: + # A subrepo that existed in node1 was deleted between node1 and + # node2 (inclusive). Thus, ctx2's substate won't contain that + # subpath. The best we can do is to ignore it. + tempnode2 = None + submatch = matchmod.subdirmatcher(subpath, match) + sub.diff(ui, diffopts, tempnode2, submatch, changes=changes, + stat=stat, fp=fp, prefix=prefix) + +class changesetdiffer(object): + """Generate diff of changeset with pre-configured filtering functions""" + + def _makefilematcher(self, ctx): + return scmutil.matchall(ctx.repo()) + + def _makehunksfilter(self, ctx): + return None + + def showdiff(self, ui, ctx, diffopts, stat=False): + repo = ctx.repo() + node = ctx.node() + prev = ctx.p1().node() + diffordiffstat(ui, repo, diffopts, prev, node, + match=self._makefilematcher(ctx), stat=stat, + hunksfilterfn=self._makehunksfilter(ctx)) + +def changesetlabels(ctx): + labels = ['log.changeset', 'changeset.%s' % ctx.phasestr()] + if ctx.obsolete(): + labels.append('changeset.obsolete') + if ctx.isunstable(): + labels.append('changeset.unstable') + for instability in ctx.instabilities(): + labels.append('instability.%s' % instability) + return ' '.join(labels) + +class changesetprinter(object): + '''show changeset information when templating not requested.''' + + def __init__(self, ui, repo, differ=None, diffopts=None, buffered=False): + self.ui = ui + self.repo = repo + self.buffered = buffered + self._differ = differ or changesetdiffer() + self.diffopts = diffopts or {} + self.header = {} + self.hunk = {} + self.lastheader = None + self.footer = None + self._columns = templatekw.getlogcolumns() + + def flush(self, 
ctx): + rev = ctx.rev() + if rev in self.header: + h = self.header[rev] + if h != self.lastheader: + self.lastheader = h + self.ui.write(h) + del self.header[rev] + if rev in self.hunk: + self.ui.write(self.hunk[rev]) + del self.hunk[rev] + + def close(self): + if self.footer: + self.ui.write(self.footer) + + def show(self, ctx, copies=None, **props): + props = pycompat.byteskwargs(props) + if self.buffered: + self.ui.pushbuffer(labeled=True) + self._show(ctx, copies, props) + self.hunk[ctx.rev()] = self.ui.popbuffer() + else: + self._show(ctx, copies, props) + + def _show(self, ctx, copies, props): + '''show a single changeset or file revision''' + changenode = ctx.node() + rev = ctx.rev() + + if self.ui.quiet: + self.ui.write("%s\n" % scmutil.formatchangeid(ctx), + label='log.node') + return + + columns = self._columns + self.ui.write(columns['changeset'] % scmutil.formatchangeid(ctx), + label=changesetlabels(ctx)) + + # branches are shown first before any other names due to backwards + # compatibility + branch = ctx.branch() + # don't show the default branch name + if branch != 'default': + self.ui.write(columns['branch'] % branch, label='log.branch') + + for nsname, ns in self.repo.names.iteritems(): + # branches has special logic already handled above, so here we just + # skip it + if nsname == 'branches': + continue + # we will use the templatename as the color name since those two + # should be the same + for name in ns.names(self.repo, changenode): + self.ui.write(ns.logfmt % name, + label='log.%s' % ns.colorname) + if self.ui.debugflag: + self.ui.write(columns['phase'] % ctx.phasestr(), label='log.phase') + for pctx in scmutil.meaningfulparents(self.repo, ctx): + label = 'log.parent changeset.%s' % pctx.phasestr() + self.ui.write(columns['parent'] % scmutil.formatchangeid(pctx), + label=label) + + if self.ui.debugflag and rev is not None: + mnode = ctx.manifestnode() + mrev = self.repo.manifestlog._revlog.rev(mnode) + self.ui.write(columns['manifest'] + % 
scmutil.formatrevnode(self.ui, mrev, mnode), + label='ui.debug log.manifest') + self.ui.write(columns['user'] % ctx.user(), label='log.user') + self.ui.write(columns['date'] % dateutil.datestr(ctx.date()), + label='log.date') + + if ctx.isunstable(): + instabilities = ctx.instabilities() + self.ui.write(columns['instability'] % ', '.join(instabilities), + label='log.instability') + + elif ctx.obsolete(): + self._showobsfate(ctx) + + self._exthook(ctx) + + if self.ui.debugflag: + files = ctx.p1().status(ctx)[:3] + for key, value in zip(['files', 'files+', 'files-'], files): + if value: + self.ui.write(columns[key] % " ".join(value), + label='ui.debug log.files') + elif ctx.files() and self.ui.verbose: + self.ui.write(columns['files'] % " ".join(ctx.files()), + label='ui.note log.files') + if copies and self.ui.verbose: + copies = ['%s (%s)' % c for c in copies] + self.ui.write(columns['copies'] % ' '.join(copies), + label='ui.note log.copies') + + extra = ctx.extra() + if extra and self.ui.debugflag: + for key, value in sorted(extra.items()): + self.ui.write(columns['extra'] + % (key, stringutil.escapestr(value)), + label='ui.debug log.extra') + + description = ctx.description().strip() + if description: + if self.ui.verbose: + self.ui.write(_("description:\n"), + label='ui.note log.description') + self.ui.write(description, + label='ui.note log.description') + self.ui.write("\n\n") + else: + self.ui.write(columns['summary'] % description.splitlines()[0], + label='log.summary') + self.ui.write("\n") + + self._showpatch(ctx) + + def _showobsfate(self, ctx): + # TODO: do not depend on templater + tres = formatter.templateresources(self.repo.ui, self.repo) + t = formatter.maketemplater(self.repo.ui, '{join(obsfate, "\n")}', + defaults=templatekw.keywords, + resources=tres) + obsfate = t.renderdefault({'ctx': ctx}).splitlines() + + if obsfate: + for obsfateline in obsfate: + self.ui.write(self._columns['obsolete'] % obsfateline, + label='log.obsfate') + + def 
_exthook(self, ctx): + '''empty method used by extension as a hook point + ''' + + def _showpatch(self, ctx): + stat = self.diffopts.get('stat') + diff = self.diffopts.get('patch') + diffopts = patch.diffallopts(self.ui, self.diffopts) + if stat: + self._differ.showdiff(self.ui, ctx, diffopts, stat=True) + if stat and diff: + self.ui.write("\n") + if diff: + self._differ.showdiff(self.ui, ctx, diffopts, stat=False) + if stat or diff: + self.ui.write("\n") + +class changesetformatter(changesetprinter): + """Format changeset information by generic formatter""" + + def __init__(self, ui, repo, fm, differ=None, diffopts=None, + buffered=False): + changesetprinter.__init__(self, ui, repo, differ, diffopts, buffered) + self._fm = fm + + def close(self): + self._fm.end() + + def _show(self, ctx, copies, props): + '''show a single changeset or file revision''' + fm = self._fm + fm.startitem() + + # TODO: maybe this should be wdirrev/wdirnode? + rev = ctx.rev() + if rev is None: + hexnode = None + else: + hexnode = fm.hexfunc(ctx.node()) + fm.data(rev=rev, + node=hexnode) + + if self.ui.quiet: + return + + fm.data(branch=ctx.branch(), + phase=ctx.phasestr(), + user=ctx.user(), + date=fm.formatdate(ctx.date()), + desc=ctx.description(), + bookmarks=fm.formatlist(ctx.bookmarks(), name='bookmark'), + tags=fm.formatlist(ctx.tags(), name='tag'), + parents=fm.formatlist([fm.hexfunc(c.node()) + for c in ctx.parents()], name='node')) + + if self.ui.debugflag: + if rev is None: + hexnode = None + else: + hexnode = fm.hexfunc(ctx.manifestnode()) + fm.data(manifest=hexnode, + extra=fm.formatdict(ctx.extra())) + + files = ctx.p1().status(ctx) + fm.data(modified=fm.formatlist(files[0], name='file'), + added=fm.formatlist(files[1], name='file'), + removed=fm.formatlist(files[2], name='file')) + + elif self.ui.verbose: + fm.data(files=fm.formatlist(ctx.files(), name='file')) + if copies: + fm.data(copies=fm.formatdict(copies, + key='name', value='source')) + + stat = 
self.diffopts.get('stat') + diff = self.diffopts.get('patch') + diffopts = patch.difffeatureopts(self.ui, self.diffopts, git=True) + if stat: + self.ui.pushbuffer() + self._differ.showdiff(self.ui, ctx, diffopts, stat=True) + fm.data(diffstat=self.ui.popbuffer()) + if diff: + self.ui.pushbuffer() + self._differ.showdiff(self.ui, ctx, diffopts, stat=False) + fm.data(diff=self.ui.popbuffer()) + +class changesettemplater(changesetprinter): + '''format changeset information. + + Note: there are a variety of convenience functions to build a + changesettemplater for common cases. See functions such as: + maketemplater, changesetdisplayer, buildcommittemplate, or other + functions that use changesest_templater. + ''' + + # Arguments before "buffered" used to be positional. Consider not + # adding/removing arguments before "buffered" to not break callers. + def __init__(self, ui, repo, tmplspec, differ=None, diffopts=None, + buffered=False): + changesetprinter.__init__(self, ui, repo, differ, diffopts, buffered) + # tres is shared with _graphnodeformatter() + self._tresources = tres = formatter.templateresources(ui, repo) + self.t = formatter.loadtemplater(ui, tmplspec, + defaults=templatekw.keywords, + resources=tres, + cache=templatekw.defaulttempl) + self._counter = itertools.count() + + self._tref = tmplspec.ref + self._parts = {'header': '', 'footer': '', + tmplspec.ref: tmplspec.ref, + 'docheader': '', 'docfooter': '', + 'separator': ''} + if tmplspec.mapfile: + # find correct templates for current mode, for backward + # compatibility with 'log -v/-q/--debug' using a mapfile + tmplmodes = [ + (True, ''), + (self.ui.verbose, '_verbose'), + (self.ui.quiet, '_quiet'), + (self.ui.debugflag, '_debug'), + ] + for mode, postfix in tmplmodes: + for t in self._parts: + cur = t + postfix + if mode and cur in self.t: + self._parts[t] = cur + else: + partnames = [p for p in self._parts.keys() if p != tmplspec.ref] + m = formatter.templatepartsmap(tmplspec, self.t, partnames) + 
self._parts.update(m) + + if self._parts['docheader']: + self.ui.write(self.t.render(self._parts['docheader'], {})) + + def close(self): + if self._parts['docfooter']: + if not self.footer: + self.footer = "" + self.footer += self.t.render(self._parts['docfooter'], {}) + return super(changesettemplater, self).close() + + def _show(self, ctx, copies, props): + '''show a single changeset or file revision''' + props = props.copy() + props['ctx'] = ctx + props['index'] = index = next(self._counter) + props['revcache'] = {'copies': copies} + + # write separator, which wouldn't work well with the header part below + # since there's inherently a conflict between header (across items) and + # separator (per item) + if self._parts['separator'] and index > 0: + self.ui.write(self.t.render(self._parts['separator'], {})) + + # write header + if self._parts['header']: + h = self.t.render(self._parts['header'], props) + if self.buffered: + self.header[ctx.rev()] = h + else: + if self.lastheader != h: + self.lastheader = h + self.ui.write(h) + + # write changeset metadata, then patch if requested + key = self._parts[self._tref] + self.ui.write(self.t.render(key, props)) + self._showpatch(ctx) + + if self._parts['footer']: + if not self.footer: + self.footer = self.t.render(self._parts['footer'], props) + +def templatespec(tmpl, mapfile): + if mapfile: + return formatter.templatespec('changeset', tmpl, mapfile) + else: + return formatter.templatespec('', tmpl, None) + +def _lookuptemplate(ui, tmpl, style): + """Find the template matching the given template spec or style + + See formatter.lookuptemplate() for details. 
+ """ + + # ui settings + if not tmpl and not style: # template are stronger than style + tmpl = ui.config('ui', 'logtemplate') + if tmpl: + return templatespec(templater.unquotestring(tmpl), None) + else: + style = util.expandpath(ui.config('ui', 'style')) + + if not tmpl and style: + mapfile = style + if not os.path.split(mapfile)[0]: + mapname = (templater.templatepath('map-cmdline.' + mapfile) + or templater.templatepath(mapfile)) + if mapname: + mapfile = mapname + return templatespec(None, mapfile) + + if not tmpl: + return templatespec(None, None) + + return formatter.lookuptemplate(ui, 'changeset', tmpl) + +def maketemplater(ui, repo, tmpl, buffered=False): + """Create a changesettemplater from a literal template 'tmpl' + byte-string.""" + spec = templatespec(tmpl, None) + return changesettemplater(ui, repo, spec, buffered=buffered) + +def changesetdisplayer(ui, repo, opts, differ=None, buffered=False): + """show one changeset using template or regular display. + + Display format will be the first non-empty hit of: + 1. option 'template' + 2. option 'style' + 3. [ui] setting 'logtemplate' + 4. [ui] setting 'style' + If all of these values are either the unset or the empty string, + regular display via changesetprinter() is done. + """ + postargs = (differ, opts, buffered) + if opts.get('template') == 'json': + fm = ui.formatter('log', opts) + return changesetformatter(ui, repo, fm, *postargs) + + spec = _lookuptemplate(ui, opts.get('template'), opts.get('style')) + + if not spec.ref and not spec.tmpl and not spec.mapfile: + return changesetprinter(ui, repo, *postargs) + + return changesettemplater(ui, repo, spec, *postargs) + +def _makematcher(repo, revs, pats, opts): + """Build matcher and expanded patterns from log options + + If --follow, revs are the revisions to follow from. 
+ + Returns (match, pats, slowpath) where + - match: a matcher built from the given pats and -I/-X opts + - pats: patterns used (globs are expanded on Windows) + - slowpath: True if patterns aren't as simple as scanning filelogs + """ + # pats/include/exclude are passed to match.match() directly in + # _matchfiles() revset but walkchangerevs() builds its matcher with + # scmutil.match(). The difference is input pats are globbed on + # platforms without shell expansion (windows). + wctx = repo[None] + match, pats = scmutil.matchandpats(wctx, pats, opts) + slowpath = match.anypats() or (not match.always() and opts.get('removed')) + if not slowpath: + follow = opts.get('follow') or opts.get('follow_first') + startctxs = [] + if follow and opts.get('rev'): + startctxs = [repo[r] for r in revs] + for f in match.files(): + if follow and startctxs: + # No idea if the path was a directory at that revision, so + # take the slow path. + if any(f not in c for c in startctxs): + slowpath = True + continue + elif follow and f not in wctx: + # If the file exists, it may be a directory, so let it + # take the slow path. + if os.path.exists(repo.wjoin(f)): + slowpath = True + continue + else: + raise error.Abort(_('cannot follow file not in parent ' + 'revision: "%s"') % f) + filelog = repo.file(f) + if not filelog: + # A zero count may be a directory or deleted file, so + # try to find matching entries on the slow path. + if follow: + raise error.Abort( + _('cannot follow nonexistent file: "%s"') % f) + slowpath = True + + # We decided to fall back to the slowpath because at least one + # of the paths was not a file. Check to see if at least one of them + # existed in history - in that case, we'll continue down the + # slowpath; otherwise, we can turn off the slowpath + if slowpath: + for path in match.files(): + if path == '.' 
or path in repo.store: + break + else: + slowpath = False + + return match, pats, slowpath + +def _fileancestors(repo, revs, match, followfirst): + fctxs = [] + for r in revs: + ctx = repo[r] + fctxs.extend(ctx[f].introfilectx() for f in ctx.walk(match)) + + # When displaying a revision with --patch --follow FILE, we have + # to know which file of the revision must be diffed. With + # --follow, we want the names of the ancestors of FILE in the + # revision, stored in "fcache". "fcache" is populated as a side effect + # of the graph traversal. + fcache = {} + def filematcher(ctx): + return scmutil.matchfiles(repo, fcache.get(ctx.rev(), [])) + + def revgen(): + for rev, cs in dagop.filectxancestors(fctxs, followfirst=followfirst): + fcache[rev] = [c.path() for c in cs] + yield rev + return smartset.generatorset(revgen(), iterasc=False), filematcher + +def _makenofollowfilematcher(repo, pats, opts): + '''hook for extensions to override the filematcher for non-follow cases''' + return None + +_opt2logrevset = { + 'no_merges': ('not merge()', None), + 'only_merges': ('merge()', None), + '_matchfiles': (None, '_matchfiles(%ps)'), + 'date': ('date(%s)', None), + 'branch': ('branch(%s)', '%lr'), + '_patslog': ('filelog(%s)', '%lr'), + 'keyword': ('keyword(%s)', '%lr'), + 'prune': ('ancestors(%s)', 'not %lr'), + 'user': ('user(%s)', '%lr'), +} + +def _makerevset(repo, match, pats, slowpath, opts): + """Return a revset string built from log options and file patterns""" + opts = dict(opts) + # follow or not follow? + follow = opts.get('follow') or opts.get('follow_first') + + # branch and only_branch are really aliases and must be handled at + # the same time + opts['branch'] = opts.get('branch', []) + opts.get('only_branch', []) + opts['branch'] = [repo.lookupbranch(b) for b in opts['branch']] + + if slowpath: + # See walkchangerevs() slow path. 
+ # + # pats/include/exclude cannot be represented as separate + # revset expressions as their filtering logic applies at file + # level. For instance "-I a -X b" matches a revision touching + # "a" and "b" while "file(a) and not file(b)" does + # not. Besides, filesets are evaluated against the working + # directory. + matchargs = ['r:', 'd:relpath'] + for p in pats: + matchargs.append('p:' + p) + for p in opts.get('include', []): + matchargs.append('i:' + p) + for p in opts.get('exclude', []): + matchargs.append('x:' + p) + opts['_matchfiles'] = matchargs + elif not follow: + opts['_patslog'] = list(pats) + + expr = [] + for op, val in sorted(opts.iteritems()): + if not val: + continue + if op not in _opt2logrevset: + continue + revop, listop = _opt2logrevset[op] + if revop and '%' not in revop: + expr.append(revop) + elif not listop: + expr.append(revsetlang.formatspec(revop, val)) + else: + if revop: + val = [revsetlang.formatspec(revop, v) for v in val] + expr.append(revsetlang.formatspec(listop, val)) + + if expr: + expr = '(' + ' and '.join(expr) + ')' + else: + expr = None + return expr + +def _initialrevs(repo, opts): + """Return the initial set of revisions to be filtered or followed""" + follow = opts.get('follow') or opts.get('follow_first') + if opts.get('rev'): + revs = scmutil.revrange(repo, opts['rev']) + elif follow and repo.dirstate.p1() == nullid: + revs = smartset.baseset() + elif follow: + revs = repo.revs('.') + else: + revs = smartset.spanset(repo) + revs.reverse() + return revs + +def getrevs(repo, pats, opts): + """Return (revs, differ) where revs is a smartset + + differ is a changesetdiffer with pre-configured file matcher. 
+ """ + follow = opts.get('follow') or opts.get('follow_first') + followfirst = opts.get('follow_first') + limit = getlimit(opts) + revs = _initialrevs(repo, opts) + if not revs: + return smartset.baseset(), None + match, pats, slowpath = _makematcher(repo, revs, pats, opts) + filematcher = None + if follow: + if slowpath or match.always(): + revs = dagop.revancestors(repo, revs, followfirst=followfirst) + else: + revs, filematcher = _fileancestors(repo, revs, match, followfirst) + revs.reverse() + if filematcher is None: + filematcher = _makenofollowfilematcher(repo, pats, opts) + if filematcher is None: + def filematcher(ctx): + return match + + expr = _makerevset(repo, match, pats, slowpath, opts) + if opts.get('graph') and opts.get('rev'): + # User-specified revs might be unsorted, but don't sort before + # _makerevset because it might depend on the order of revs + if not (revs.isdescending() or revs.istopo()): + revs.sort(reverse=True) + if expr: + matcher = revset.match(None, expr) + revs = matcher(repo, revs) + if limit is not None: + revs = revs.slice(0, limit) + + differ = changesetdiffer() + differ._makefilematcher = filematcher + return revs, differ + +def _parselinerangeopt(repo, opts): + """Parse --line-range log option and return a list of tuples (filename, + (fromline, toline)). + """ + linerangebyfname = [] + for pat in opts.get('line_range', []): + try: + pat, linerange = pat.rsplit(',', 1) + except ValueError: + raise error.Abort(_('malformatted line-range pattern %s') % pat) + try: + fromline, toline = map(int, linerange.split(':')) + except ValueError: + raise error.Abort(_("invalid line range for %s") % pat) + msg = _("line range pattern '%s' must match exactly one file") % pat + fname = scmutil.parsefollowlinespattern(repo, None, pat, msg) + linerangebyfname.append( + (fname, util.processlinerange(fromline, toline))) + return linerangebyfname + +def getlinerangerevs(repo, userrevs, opts): + """Return (revs, differ). 
+ + "revs" are revisions obtained by processing "line-range" log options and + walking block ancestors of each specified file/line-range. + + "differ" is a changesetdiffer with pre-configured file matcher and hunks + filter. + """ + wctx = repo[None] + + # Two-levels map of "rev -> file ctx -> [line range]". + linerangesbyrev = {} + for fname, (fromline, toline) in _parselinerangeopt(repo, opts): + if fname not in wctx: + raise error.Abort(_('cannot follow file not in parent ' + 'revision: "%s"') % fname) + fctx = wctx.filectx(fname) + for fctx, linerange in dagop.blockancestors(fctx, fromline, toline): + rev = fctx.introrev() + if rev not in userrevs: + continue + linerangesbyrev.setdefault( + rev, {}).setdefault( + fctx.path(), []).append(linerange) + + def nofilterhunksfn(fctx, hunks): + return hunks + + def hunksfilter(ctx): + fctxlineranges = linerangesbyrev.get(ctx.rev()) + if fctxlineranges is None: + return nofilterhunksfn + + def filterfn(fctx, hunks): + lineranges = fctxlineranges.get(fctx.path()) + if lineranges is not None: + for hr, lines in hunks: + if hr is None: # binary + yield hr, lines + continue + if any(mdiff.hunkinrange(hr[2:], lr) + for lr in lineranges): + yield hr, lines + else: + for hunk in hunks: + yield hunk + + return filterfn + + def filematcher(ctx): + files = list(linerangesbyrev.get(ctx.rev(), [])) + return scmutil.matchfiles(repo, files) + + revs = sorted(linerangesbyrev, reverse=True) + + differ = changesetdiffer() + differ._makefilematcher = filematcher + differ._makehunksfilter = hunksfilter + return revs, differ + +def _graphnodeformatter(ui, displayer): + spec = ui.config('ui', 'graphnodetemplate') + if not spec: + return templatekw.getgraphnode # fast path for "{graphnode}" + + spec = templater.unquotestring(spec) + if isinstance(displayer, changesettemplater): + # reuse cache of slow templates + tres = displayer._tresources + else: + tres = formatter.templateresources(ui) + templ = formatter.maketemplater(ui, spec, 
defaults=templatekw.keywords, + resources=tres) + def formatnode(repo, ctx): + props = {'ctx': ctx, 'repo': repo} + return templ.renderdefault(props) + return formatnode + +def displaygraph(ui, repo, dag, displayer, edgefn, getrenamed=None, props=None): + props = props or {} + formatnode = _graphnodeformatter(ui, displayer) + state = graphmod.asciistate() + styles = state['styles'] + + # only set graph styling if HGPLAIN is not set. + if ui.plain('graph'): + # set all edge styles to |, the default pre-3.8 behaviour + styles.update(dict.fromkeys(styles, '|')) + else: + edgetypes = { + 'parent': graphmod.PARENT, + 'grandparent': graphmod.GRANDPARENT, + 'missing': graphmod.MISSINGPARENT + } + for name, key in edgetypes.items(): + # experimental config: experimental.graphstyle.* + styles[key] = ui.config('experimental', 'graphstyle.%s' % name, + styles[key]) + if not styles[key]: + styles[key] = None + + # experimental config: experimental.graphshorten + state['graphshorten'] = ui.configbool('experimental', 'graphshorten') + + for rev, type, ctx, parents in dag: + char = formatnode(repo, ctx) + copies = None + if getrenamed and ctx.rev(): + copies = [] + for fn in ctx.files(): + rename = getrenamed(fn, ctx.rev()) + if rename: + copies.append((fn, rename[0])) + edges = edgefn(type, char, state, rev, parents) + firstedge = next(edges) + width = firstedge[2] + displayer.show(ctx, copies=copies, + graphwidth=width, **pycompat.strkwargs(props)) + lines = displayer.hunk.pop(rev).split('\n') + if not lines[-1]: + del lines[-1] + displayer.flush(ctx) + for type, char, width, coldata in itertools.chain([firstedge], edges): + graphmod.ascii(ui, state, type, char, lines, coldata) + lines = [] + displayer.close() + +def displaygraphrevs(ui, repo, revs, displayer, getrenamed): + revdag = graphmod.dagwalker(repo, revs) + displaygraph(ui, repo, revdag, displayer, graphmod.asciiedges, getrenamed) + +def displayrevs(ui, repo, revs, displayer, getrenamed): + for rev in revs: + ctx = 
repo[rev] + copies = None + if getrenamed is not None and rev: + copies = [] + for fn in ctx.files(): + rename = getrenamed(fn, rev) + if rename: + copies.append((fn, rename[0])) + displayer.show(ctx, copies=copies) + displayer.flush(ctx) + displayer.close() + +def checkunsupportedgraphflags(pats, opts): + for op in ["newest_first"]: + if op in opts and opts[op]: + raise error.Abort(_("-G/--graph option is incompatible with --%s") + % op.replace("_", "-")) + +def graphrevs(repo, nodes, opts): + limit = getlimit(opts) + nodes.reverse() + if limit is not None: + nodes = nodes[:limit] + return graphmod.nodes(repo, nodes) diff -r fb92df8b634c -r ed5448edcbfa mercurial/logexchange.py --- a/mercurial/logexchange.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/logexchange.py Wed Apr 18 15:32:08 2018 -0400 @@ -11,6 +11,7 @@ from .node import hex from . import ( + util, vfs as vfsmod, ) @@ -94,6 +95,30 @@ finally: wlock.release() +def activepath(repo, remote): + """returns remote path""" + local = None + # is the remote a local peer + local = remote.local() + + # determine the remote path from the repo, if possible; else just + # use the string given to us + rpath = remote + if local: + rpath = remote._repo.root + elif not isinstance(remote, bytes): + rpath = remote._url + + # represent the remotepath with user defined path name if exists + for path, url in repo.ui.configitems('paths'): + # remove auth info from user defined url + url = util.removeauth(url) + if url == rpath: + rpath = path + break + + return rpath + def pullremotenames(localrepo, remoterepo): """ pulls bookmarks and branches information of the remote repo during a @@ -101,15 +126,24 @@ localrepo is our local repository remoterepo is the peer instance """ - remotepath = remoterepo.url() - bookmarks = remoterepo.listkeys('bookmarks') + remotepath = activepath(localrepo, remoterepo) + + with remoterepo.commandexecutor() as e: + bookmarks = e.callcommand('listkeys', { + 'namespace': 'bookmarks', + 
}).result() + # on a push, we don't want to keep obsolete heads since # they won't show up as heads on the next pull, so we # remove them here otherwise we would require the user # to issue a pull to refresh the storage bmap = {} repo = localrepo.unfiltered() - for branch, nodes in remoterepo.branchmap().iteritems(): + + with remoterepo.commandexecutor() as e: + branchmap = e.callcommand('branchmap', {}).result() + + for branch, nodes in branchmap.iteritems(): bmap[branch] = [] for node in nodes: if node in repo and not repo[node].obsolete(): diff -r fb92df8b634c -r ed5448edcbfa mercurial/lsprof.py --- a/mercurial/lsprof.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/lsprof.py Wed Apr 18 15:32:08 2018 -0400 @@ -27,7 +27,7 @@ def __init__(self, data): self.data = data - def sort(self, crit="inlinetime"): + def sort(self, crit=r"inlinetime"): """XXX docstring""" # profiler_entries isn't defined when running under PyPy. if profiler_entry: diff -r fb92df8b634c -r ed5448edcbfa mercurial/mail.py --- a/mercurial/mail.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/mail.py Wed Apr 18 15:32:08 2018 -0400 @@ -20,9 +20,14 @@ from . import ( encoding, error, + pycompat, sslutil, util, ) +from .utils import ( + procutil, + stringutil, +) class STARTTLS(smtplib.SMTP): '''Derived class to verify the peer certificate for STARTTLS. @@ -80,7 +85,7 @@ local_hostname = ui.config('smtp', 'local_hostname') tls = ui.config('smtp', 'tls') # backward compatible: when tls = true, we use starttls. 
- starttls = tls == 'starttls' or util.parsebool(tls) + starttls = tls == 'starttls' or stringutil.parsebool(tls) smtps = tls == 'smtps' if (starttls or smtps) and not util.safehasattr(socket, 'ssl'): raise error.Abort(_("can't use TLS: Python SSL support not installed")) @@ -136,16 +141,16 @@ def _sendmail(ui, sender, recipients, msg): '''send mail using sendmail.''' program = ui.config('email', 'method') - cmdline = '%s -f %s %s' % (program, util.email(sender), - ' '.join(map(util.email, recipients))) + cmdline = '%s -f %s %s' % (program, stringutil.email(sender), + ' '.join(map(stringutil.email, recipients))) ui.note(_('sending mail: %s\n') % cmdline) - fp = util.popen(cmdline, 'w') - fp.write(msg) + fp = procutil.popen(cmdline, 'wb') + fp.write(util.tonativeeol(msg)) ret = fp.close() if ret: raise error.Abort('%s %s' % ( os.path.basename(program.split(None, 1)[0]), - util.explainexit(ret)[0])) + procutil.explainexit(ret))) def _mbox(mbox, sender, recipients, msg): '''write mails to mbox''' @@ -180,13 +185,13 @@ raise error.Abort(_('smtp specified as email transport, ' 'but no smtp host configured')) else: - if not util.findexe(method): + if not procutil.findexe(method): raise error.Abort(_('%r specified as email transport, ' 'but not in PATH') % method) def codec2iana(cs): '''''' - cs = email.charset.Charset(cs).input_charset.lower() + cs = pycompat.sysbytes(email.charset.Charset(cs).input_charset.lower()) # "latin1" normalizes to "iso8859-1", standard calls for "iso-8859-1" if cs.startswith("iso") and not cs.startswith("iso-"): @@ -205,7 +210,7 @@ return mimetextqp(s, subtype, 'us-ascii') for charset in cs: try: - s.decode(charset) + s.decode(pycompat.sysstr(charset)) return mimetextqp(s, subtype, codec2iana(charset)) except UnicodeDecodeError: pass @@ -218,7 +223,7 @@ ''' cs = email.charset.Charset(charset) msg = email.message.Message() - msg.set_type('text/' + subtype) + msg.set_type(pycompat.sysstr('text/' + subtype)) for line in body.splitlines(): if 
len(line) > 950: @@ -287,13 +292,13 @@ addr = addr.encode('ascii') except UnicodeDecodeError: raise error.Abort(_('invalid local address: %s') % addr) - return email.Utils.formataddr((name, addr)) + return email.utils.formataddr((name, addr)) def addressencode(ui, address, charsets=None, display=False): '''Turns address into RFC-2047 compliant header.''' if display or not address: return address or '' - name, addr = email.Utils.parseaddr(address) + name, addr = email.utils.parseaddr(address) return _addressencode(ui, name, addr, charsets) def addrlistencode(ui, addrs, charsets=None, display=False): @@ -304,7 +309,7 @@ return [a.strip() for a in addrs if a.strip()] result = [] - for name, addr in email.Utils.getaddresses(addrs): + for name, addr in email.utils.getaddresses(addrs): if name or addr: result.append(_addressencode(ui, name, addr, charsets)) return result @@ -327,6 +332,11 @@ continue except UnicodeDecodeError: pass + # On Python 3, decode_header() may return either bytes or unicode + # depending on whether the header has =?? or not + if isinstance(part, type(u'')): + uparts.append(part) + continue try: uparts.append(part.decode('UTF-8')) continue diff -r fb92df8b634c -r ed5448edcbfa mercurial/manifest.py --- a/mercurial/manifest.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/manifest.py Wed Apr 18 15:32:08 2018 -0400 @@ -9,7 +9,6 @@ import heapq import itertools -import os import struct from .i18n import _ @@ -28,7 +27,7 @@ parsers = policy.importmod(r'parsers') propertycache = util.propertycache -def _parsev1(data): +def _parse(data): # This method does a little bit of excessive-looking # precondition checking. 
This is so that the behavior of this # class exactly matches its C counterpart to try and help @@ -47,43 +46,7 @@ else: yield f, bin(n), '' -def _parsev2(data): - metadataend = data.find('\n') - # Just ignore metadata for now - pos = metadataend + 1 - prevf = '' - while pos < len(data): - end = data.find('\n', pos + 1) # +1 to skip stem length byte - if end == -1: - raise ValueError('Manifest ended with incomplete file entry.') - stemlen = ord(data[pos:pos + 1]) - items = data[pos + 1:end].split('\0') - f = prevf[:stemlen] + items[0] - if prevf > f: - raise ValueError('Manifest entries not in sorted order.') - fl = items[1] - # Just ignore metadata (items[2:] for now) - n = data[end + 1:end + 21] - yield f, n, fl - pos = end + 22 - prevf = f - -def _parse(data): - """Generates (path, node, flags) tuples from a manifest text""" - if data.startswith('\0'): - return iter(_parsev2(data)) - else: - return iter(_parsev1(data)) - -def _text(it, usemanifestv2): - """Given an iterator over (path, node, flags) tuples, returns a manifest - text""" - if usemanifestv2: - return _textv2(it) - else: - return _textv1(it) - -def _textv1(it): +def _text(it): files = [] lines = [] _hex = revlog.hex @@ -96,19 +59,6 @@ _checkforbidden(files) return ''.join(lines) -def _textv2(it): - files = [] - lines = ['\0\n'] - prevf = '' - for f, n, fl in it: - files.append(f) - stem = os.path.commonprefix([prevf, f]) - stemlen = min(len(stem), 255) - lines.append("%c%s\0%s\n%s\n" % (stemlen, f[stemlen:], fl, n)) - prevf = f - _checkforbidden(files) - return ''.join(lines) - class lazymanifestiter(object): def __init__(self, lm): self.pos = 0 @@ -414,13 +364,7 @@ class manifestdict(object): def __init__(self, data=''): - if data.startswith('\0'): - #_lazymanifest can not parse v2 - self._lm = _lazymanifest('') - for f, n, fl in _parsev2(data): - self._lm[f] = n, fl - else: - self._lm = _lazymanifest(data) + self._lm = _lazymanifest(data) def __getitem__(self, key): return self._lm[key][0] @@ 
-589,12 +533,9 @@ def iterentries(self): return self._lm.iterentries() - def text(self, usemanifestv2=False): - if usemanifestv2: - return _textv2(self._lm.iterentries()) - else: - # use (probably) native version for v1 - return self._lm.text() + def text(self): + # most likely uses native version + return self._lm.text() def fastdelta(self, base, changes): """Given a base manifest text as a bytearray and a list of changes @@ -755,6 +696,12 @@ size += m.__len__() return size + def __nonzero__(self): + # Faster than "__len() != 0" since it avoids loading sub-manifests + return not self._isempty() + + __bool__ = __nonzero__ + def _isempty(self): self._load() # for consistency; already loaded by all callers return (not self._files and (not self._dirs or @@ -954,7 +901,7 @@ else: files.update(m1.iterkeys()) - for fn in t1._files.iterkeys(): + for fn in t1._files: if fn not in t2._files: files.add(t1._subpath(fn)) @@ -1013,7 +960,7 @@ # yield this dir's files and walk its submanifests self._load() - for p in sorted(self._dirs.keys() + self._files.keys()): + for p in sorted(list(self._dirs) + list(self._files)): if p in self._files: fullp = self._subpath(p) if match(fullp): @@ -1132,12 +1079,12 @@ if fl: self._flags[f] = fl - def text(self, usemanifestv2=False): + def text(self): """Get the full data of this manifest as a bytestring.""" self._load() - return _text(self.iterentries(), usemanifestv2) + return _text(self.iterentries()) - def dirtext(self, usemanifestv2=False): + def dirtext(self): """Get the full data of this directory as a bytestring. Make sure that any submanifests have been written first, so their nodeids are correct. 
""" @@ -1145,7 +1092,7 @@ flags = self.flags dirs = [(d[:-1], self._dirs[d]._node, 't') for d in self._dirs] files = [(f, self._files[f], flags(f)) for f in self._files] - return _text(sorted(dirs + files), usemanifestv2) + return _text(sorted(dirs + files)) def read(self, gettext, readsubtree): def _load_for_read(s): @@ -1202,22 +1149,17 @@ # stacks of commits, the number can go up, hence the config knob below. cachesize = 4 optiontreemanifest = False - usemanifestv2 = False opts = getattr(opener, 'options', None) if opts is not None: cachesize = opts.get('manifestcachesize', cachesize) optiontreemanifest = opts.get('treemanifest', False) - usemanifestv2 = opts.get('manifestv2', usemanifestv2) self._treeondisk = optiontreemanifest or treemanifest - self._usemanifestv2 = usemanifestv2 self._fulltextcache = util.lrucachedict(cachesize) if dir: assert self._treeondisk, 'opts is %r' % opts - if not dir.endswith('/'): - dir = dir + '/' if indexfile is None: indexfile = '00manifest.i' @@ -1245,19 +1187,18 @@ self._fulltextcache.clear() self._dirlogcache = {'': self} - def dirlog(self, dir): - if dir: + def dirlog(self, d): + if d: assert self._treeondisk - if dir not in self._dirlogcache: - mfrevlog = manifestrevlog(self.opener, dir, + if d not in self._dirlogcache: + mfrevlog = manifestrevlog(self.opener, d, self._dirlogcache, treemanifest=self._treeondisk) - self._dirlogcache[dir] = mfrevlog - return self._dirlogcache[dir] + self._dirlogcache[d] = mfrevlog + return self._dirlogcache[d] def add(self, m, transaction, link, p1, p2, added, removed, readtree=None): - if (p1 in self.fulltextcache and util.safehasattr(m, 'fastdelta') - and not self._usemanifestv2): + if p1 in self.fulltextcache and util.safehasattr(m, 'fastdelta'): # If our first parent is in the manifest cache, we can # compute a delta here using properties we know about the # manifest up-front, which may save time later for the @@ -1284,7 +1225,7 @@ n = self._addtree(m, transaction, link, m1, m2, readtree) 
arraytext = None else: - text = m.text(self._usemanifestv2) + text = m.text() n = self.addrevision(text, transaction, link, p1, p2) arraytext = bytearray(text) @@ -1303,13 +1244,13 @@ sublog.add(subm, transaction, link, subp1, subp2, None, None, readtree=readtree) m.writesubtrees(m1, m2, writesubtree) - text = m.dirtext(self._usemanifestv2) + text = m.dirtext() n = None if self._dir != '': # Double-check whether contents are unchanged to one parent - if text == m1.dirtext(self._usemanifestv2): + if text == m1.dirtext(): n = m1.node() - elif text == m2.dirtext(self._usemanifestv2): + elif text == m2.dirtext(): n = m2.node() if not n: @@ -1338,6 +1279,7 @@ self._treeinmem = usetreemanifest self._revlog = repo._constructmanifest() + self._narrowmatch = repo.narrowmatch() # A cache of the manifestctx or treemanifestctx for each directory self._dirmancache = {} @@ -1361,6 +1303,9 @@ if node in self._dirmancache.get(dir, ()): return self._dirmancache[dir][node] + if not self._narrowmatch.always(): + if not self._narrowmatch.visitdir(dir[:-1] or '.'): + return excludeddirmanifestctx(dir, node) if dir: if self._revlog._treeondisk: if verify: @@ -1487,19 +1432,6 @@ Changing the value of `shallow` has no effect on flat manifests. 
''' revlog = self._revlog() - if revlog._usemanifestv2: - # Need to perform a slow delta - r0 = revlog.deltaparent(revlog.rev(self._node)) - m0 = self._manifestlog[revlog.node(r0)].read() - m1 = self.read() - md = manifestdict() - for f, ((n0, fl0), (n1, fl1)) in m0.diff(m1).iteritems(): - if n1: - md[f] = n1 - if fl1: - md.setflag(f, fl1) - return md - r = revlog.rev(self._node) d = mdiff.patchtext(revlog.revdiff(revlog.deltaparent(r), r)) return manifestdict(d) @@ -1549,6 +1481,10 @@ #self.linkrev = revlog.linkrev(rev) def _revlog(self): + narrowmatch = self._manifestlog._narrowmatch + if not narrowmatch.always(): + if not narrowmatch.visitdir(self._dir[:-1] or '.'): + return excludedmanifestrevlog(self._dir) return self._manifestlog._revlog.dirlog(self._dir) def read(self): @@ -1602,7 +1538,7 @@ its 't' flag. ''' revlog = self._revlog() - if shallow and not revlog._usemanifestv2: + if shallow: r = revlog.rev(self._node) d = mdiff.patchtext(revlog.revdiff(revlog.deltaparent(r), r)) return manifestdict(d) @@ -1641,3 +1577,80 @@ def find(self, key): return self.read().find(key) + +class excludeddir(treemanifest): + """Stand-in for a directory that is excluded from the repository. + + With narrowing active on a repository that uses treemanifests, + some of the directory revlogs will be excluded from the resulting + clone. This is a huge storage win for clients, but means we need + some sort of pseudo-manifest to surface to internals so we can + detect a merge conflict outside the narrowspec. That's what this + class is: it stands in for a directory whose node is known, but + whose contents are unknown. + """ + def __init__(self, dir, node): + super(excludeddir, self).__init__(dir) + self._node = node + # Add an empty file, which will be included by iterators and such, + # appearing as the directory itself (i.e. 
something like "dir/") + self._files[''] = node + self._flags[''] = 't' + + # Manifests outside the narrowspec should never be modified, so avoid + # copying. This makes a noticeable difference when there are very many + # directories outside the narrowspec. Also, it makes sense for the copy to + # be of the same type as the original, which would not happen with the + # super type's copy(). + def copy(self): + return self + +class excludeddirmanifestctx(treemanifestctx): + """context wrapper for excludeddir - see that docstring for rationale""" + def __init__(self, dir, node): + self._dir = dir + self._node = node + + def read(self): + return excludeddir(self._dir, self._node) + + def write(self, *args): + raise error.ProgrammingError( + 'attempt to write manifest from excluded dir %s' % self._dir) + +class excludedmanifestrevlog(manifestrevlog): + """Stand-in for excluded treemanifest revlogs. + + When narrowing is active on a treemanifest repository, we'll have + references to directories we can't see due to the revlog being + skipped. This class exists to conform to the manifestrevlog + interface for those directories and proactively prevent writes to + outside the narrowspec. + """ + + def __init__(self, dir): + self._dir = dir + + def __len__(self): + raise error.ProgrammingError( + 'attempt to get length of excluded dir %s' % self._dir) + + def rev(self, node): + raise error.ProgrammingError( + 'attempt to get rev from excluded dir %s' % self._dir) + + def linkrev(self, node): + raise error.ProgrammingError( + 'attempt to get linkrev from excluded dir %s' % self._dir) + + def node(self, rev): + raise error.ProgrammingError( + 'attempt to get node from excluded dir %s' % self._dir) + + def add(self, *args, **kwargs): + # We should never write entries in dirlogs outside the narrow clone. + # However, the method still gets called from writesubtree() in + # _addtree(), so we need to handle it. 
We should possibly make that + # avoid calling add() with a clean manifest (_dirty is always False + # in excludeddir instances). + pass diff -r fb92df8b634c -r ed5448edcbfa mercurial/match.py --- a/mercurial/match.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/match.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,10 +13,15 @@ from .i18n import _ from . import ( + encoding, error, pathutil, + pycompat, util, ) +from .utils import ( + stringutil, +) allpatternkinds = ('re', 'glob', 'path', 'relglob', 'relpath', 'relre', 'listfile', 'listfile0', 'set', 'include', 'subinclude', @@ -225,7 +230,7 @@ except IOError as inst: if warn: warn(_("skipping unreadable pattern file '%s': %s\n") % - (pat, inst.strerror)) + (pat, stringutil.forcebytestr(inst.strerror))) continue # else: re or relre - which cannot be normalized kindpats.append((kind, pat, '')) @@ -345,7 +350,7 @@ return 'all' def __repr__(self): - return '' + return r'' class nevermatcher(basematcher): '''Matches nothing.''' @@ -368,7 +373,7 @@ return False def __repr__(self): - return '' + return r'' class patternmatcher(basematcher): @@ -397,6 +402,7 @@ def prefix(self): return self._prefix + @encoding.strmethod def __repr__(self): return ('' % self._pats) @@ -424,8 +430,9 @@ any(parentdir in self._roots for parentdir in util.finddirs(dir))) + @encoding.strmethod def __repr__(self): - return ('' % self._pats) + return ('' % pycompat.bytestr(self._pats)) class exactmatcher(basematcher): '''Matches the input files exactly. 
They are interpreted as paths, not @@ -452,6 +459,7 @@ def isexact(self): return True + @encoding.strmethod def __repr__(self): return ('' % self._files) @@ -492,6 +500,7 @@ def isexact(self): return self._m1.isexact() + @encoding.strmethod def __repr__(self): return ('' % (self._m1, self._m2)) @@ -558,6 +567,7 @@ def isexact(self): return self._m1.isexact() or self._m2.isexact() + @encoding.strmethod def __repr__(self): return ('' % (self._m1, self._m2)) @@ -638,6 +648,7 @@ def prefix(self): return self._matcher.prefix() and not self._always + @encoding.strmethod def __repr__(self): return ('' % (self._path, self._matcher)) @@ -671,6 +682,7 @@ r |= v return r + @encoding.strmethod def __repr__(self): return ('' % self._matchers) diff -r fb92df8b634c -r ed5448edcbfa mercurial/mdiff.py --- a/mercurial/mdiff.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/mdiff.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,11 +13,15 @@ from .i18n import _ from . import ( + encoding, error, policy, pycompat, util, ) +from .utils import dateutil + +_missing_newline_marker = "\\ No newline at end of file\n" bdiff = policy.importmod(r'bdiff') mpatch = policy.importmod(r'mpatch') @@ -27,16 +31,7 @@ patches = mpatch.patches patchedsize = mpatch.patchedsize textdiff = bdiff.bdiff - -def splitnewlines(text): - '''like str.splitlines, but only split on newlines.''' - lines = [l + '\n' for l in text.split('\n')] - if lines: - if lines[-1] == '\n': - lines.pop() - else: - lines[-1] = lines[-1][:-1] - return lines +splitnewlines = bdiff.splitnewlines class diffopts(object): '''context is the number of context lines @@ -68,6 +63,7 @@ 'upgrade': False, 'showsimilarity': False, 'worddiff': False, + 'xdiff': False, } def __init__(self, **opts): @@ -82,7 +78,8 @@ self.context = int(self.context) except ValueError: raise error.Abort(_('diff context lines count must be ' - 'an integer, not %r') % self.context) + 'an integer, not %r') % + pycompat.bytestr(self.context)) def copy(self, **kwargs): opts = 
dict((k, getattr(self, k)) for k in self.defaults) @@ -100,7 +97,7 @@ if blank and opts.ignoreblanklines: text = re.sub('\n+', '\n', text).strip('\n') if opts.ignorewseol: - text = re.sub(br'[ \t\r\f]+\n', r'\n', text) + text = re.sub(br'[ \t\r\f]+\n', br'\n', text) return text def splitblock(base1, lines1, base2, lines2, opts): @@ -193,6 +190,13 @@ raise error.Abort(_('line range exceeds file size')) return filteredblocks, (lba, uba) +def chooseblocksfunc(opts=None): + if (opts is None or not opts.xdiff + or not util.safehasattr(bdiff, 'xdiffblocks')): + return bdiff.blocks + else: + return bdiff.xdiffblocks + def allblocks(text1, text2, opts=None, lines1=None, lines2=None): """Return (block, type) tuples, where block is an mdiff.blocks line entry. type is '=' for blocks matching exactly one another @@ -206,7 +210,7 @@ if opts.ignorews or opts.ignorewsamount or opts.ignorewseol: text1 = wsclean(opts, text1, False) text2 = wsclean(opts, text2, False) - diff = bdiff.blocks(text1, text2) + diff = chooseblocksfunc(opts)(text1, text2) for i, s1 in enumerate(diff): # The first match is special. # we've either found a match starting at line 0 or a match later @@ -234,13 +238,15 @@ yield s, type yield s1, '=' -def unidiff(a, ad, b, bd, fn1, fn2, opts=defaultopts): +def unidiff(a, ad, b, bd, fn1, fn2, binary, opts=defaultopts): """Return a unified diff as a (headers, hunks) tuple. If the diff is not null, `headers` is a list with unified diff header lines "--- " and "+++ " and `hunks` is a generator yielding (hunkrange, hunklines) coming from _unidiff(). Otherwise, `headers` and `hunks` are empty. + + Set binary=True if either a or b should be taken as a binary file. 
""" def datetag(date, fn=None): if not opts.git and not opts.nodates: @@ -259,23 +265,18 @@ aprefix = 'a/' bprefix = 'b/' - epoch = util.datestr((0, 0)) + epoch = dateutil.datestr((0, 0)) fn1 = util.pconvert(fn1) fn2 = util.pconvert(fn2) - def checknonewline(lines): - for text in lines: - if text[-1:] != '\n': - text += "\n\ No newline at end of file\n" - yield text - - if not opts.text and (util.binary(a) or util.binary(b)): + if binary: if a and b and len(a) == len(b) and a == b: return sentinel headerlines = [] hunks = (None, ['Binary file %s has changed\n' % fn1]), elif not a: + without_newline = not b.endswith('\n') b = splitnewlines(b) if a is None: l1 = '--- /dev/null%s' % datetag(epoch) @@ -286,8 +287,12 @@ size = len(b) hunkrange = (0, 0, 1, size) hunklines = ["@@ -0,0 +1,%d @@\n" % size] + ["+" + e for e in b] - hunks = (hunkrange, checknonewline(hunklines)), + if without_newline: + hunklines[-1] += '\n' + hunklines.append(_missing_newline_marker) + hunks = (hunkrange, hunklines), elif not b: + without_newline = not a.endswith('\n') a = splitnewlines(a) l1 = "--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1)) if b is None: @@ -298,24 +303,19 @@ size = len(a) hunkrange = (1, size, 0, 0) hunklines = ["@@ -1,%d +0,0 @@\n" % size] + ["-" + e for e in a] - hunks = (hunkrange, checknonewline(hunklines)), + if without_newline: + hunklines[-1] += '\n' + hunklines.append(_missing_newline_marker) + hunks = (hunkrange, hunklines), else: - diffhunks = _unidiff(a, b, opts=opts) - try: - hunkrange, hunklines = next(diffhunks) - except StopIteration: + hunks = _unidiff(a, b, opts=opts) + if not next(hunks): return sentinel headerlines = [ "--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1)), "+++ %s%s%s" % (bprefix, fn2, datetag(bd, fn2)), ] - def rewindhunks(): - yield hunkrange, checknonewline(hunklines) - for hr, hl in diffhunks: - yield hr, checknonewline(hl) - - hunks = rewindhunks() return headerlines, hunks @@ -327,6 +327,8 @@ form the '@@ -s1,l1 +s2,l2 @@' header and 
`hunklines` is a list of lines of the hunk combining said header followed by line additions and deletions. + + The hunks are prefixed with a bool. """ l1 = splitnewlines(t1) l2 = splitnewlines(t2) @@ -357,7 +359,11 @@ # alphanumeric char. for i in xrange(astart - 1, lastpos - 1, -1): if l1[i][0:1].isalnum(): - func = ' ' + l1[i].rstrip()[:40] + func = b' ' + l1[i].rstrip() + # split long function name if ASCII. otherwise we have no + # idea where the multi-byte boundary is, so just leave it. + if encoding.isasciistr(func): + func = func[:41] lastfunc[1] = func break # by recording this hunk's starting point as the next place to @@ -377,6 +383,26 @@ + delta + [' ' + l1[x] for x in xrange(a2, aend)] ) + # If either file ends without a newline and the last line of + # that file is part of a hunk, a marker is printed. If the + # last line of both files is identical and neither ends in + # a newline, print only one marker. That's the only case in + # which the hunk can end in a shared line without a newline. + skip = False + if not t1.endswith('\n') and astart + alen == len(l1) + 1: + for i in xrange(len(hunklines) - 1, -1, -1): + if hunklines[i].startswith(('-', ' ')): + if hunklines[i].startswith(' '): + skip = True + hunklines[i] += '\n' + hunklines.insert(i + 1, _missing_newline_marker) + break + if not skip and not t2.endswith('\n') and bstart + blen == len(l2) + 1: + for i in xrange(len(hunklines) - 1, -1, -1): + if hunklines[i].startswith('+'): + hunklines[i] += '\n' + hunklines.insert(i + 1, _missing_newline_marker) + break yield hunkrange, hunklines # bdiff.blocks gives us the matching sequences in the files. 
The loop @@ -385,6 +411,7 @@ # hunk = None ignoredlines = 0 + has_hunks = False for s, stype in allblocks(t1, t2, opts, l1, l2): a1, a2, b1, b2 = s if stype != '!': @@ -411,6 +438,9 @@ astart = hunk[1] bstart = hunk[3] else: + if not has_hunks: + has_hunks = True + yield True for x in yieldhunk(hunk): yield x if prev: @@ -427,17 +457,22 @@ delta[len(delta):] = ['+' + x for x in new] if hunk: + if not has_hunks: + has_hunks = True + yield True for x in yieldhunk(hunk): yield x + elif not has_hunks: + yield False def b85diff(to, tn): '''print base85-encoded binary diff''' def fmtline(line): l = len(line) if l <= 26: - l = chr(ord('A') + l - 1) + l = pycompat.bytechr(ord('A') + l - 1) else: - l = chr(l - 26 + ord('a') - 1) + l = pycompat.bytechr(l - 26 + ord('a') - 1) return '%c%s\n' % (l, util.b85encode(line, True)) def chunk(text, csize=52): diff -r fb92df8b634c -r ed5448edcbfa mercurial/merge.py --- a/mercurial/merge.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/merge.py Wed Apr 18 15:32:08 2018 -0400 @@ -22,16 +22,18 @@ nullid, nullrev, ) +from .thirdparty import ( + attr, +) from . import ( copies, error, - extensions, filemerge, match as matchmod, obsutil, pycompat, scmutil, - subrepo, + subrepoutil, util, worker, ) @@ -45,6 +47,47 @@ bits = bits[:-2] + bits[-1:] return '\0'.join(bits) +# Merge state record types. See ``mergestate`` docs for more. 
+RECORD_LOCAL = b'L' +RECORD_OTHER = b'O' +RECORD_MERGED = b'F' +RECORD_CHANGEDELETE_CONFLICT = b'C' +RECORD_MERGE_DRIVER_MERGE = b'D' +RECORD_PATH_CONFLICT = b'P' +RECORD_MERGE_DRIVER_STATE = b'm' +RECORD_FILE_VALUES = b'f' +RECORD_LABELS = b'l' +RECORD_OVERRIDE = b't' +RECORD_UNSUPPORTED_MANDATORY = b'X' +RECORD_UNSUPPORTED_ADVISORY = b'x' + +MERGE_DRIVER_STATE_UNMARKED = b'u' +MERGE_DRIVER_STATE_MARKED = b'm' +MERGE_DRIVER_STATE_SUCCESS = b's' + +MERGE_RECORD_UNRESOLVED = b'u' +MERGE_RECORD_RESOLVED = b'r' +MERGE_RECORD_UNRESOLVED_PATH = b'pu' +MERGE_RECORD_RESOLVED_PATH = b'pr' +MERGE_RECORD_DRIVER_RESOLVED = b'd' + +ACTION_FORGET = b'f' +ACTION_REMOVE = b'r' +ACTION_ADD = b'a' +ACTION_GET = b'g' +ACTION_PATH_CONFLICT = b'p' +ACTION_PATH_CONFLICT_RESOLVE = b'pr' +ACTION_ADD_MODIFIED = b'am' +ACTION_CREATED = b'c' +ACTION_DELETED_CHANGED = b'dc' +ACTION_CHANGED_DELETED = b'cd' +ACTION_MERGE = b'm' +ACTION_LOCAL_DIR_RENAME_GET = b'dg' +ACTION_DIR_RENAME_MOVE_LOCAL = b'dm' +ACTION_KEEP = b'k' +ACTION_EXEC = b'e' +ACTION_CREATED_MERGE = b'cm' + class mergestate(object): '''track 3-way merge state of individual files @@ -131,9 +174,9 @@ self._other = other self._readmergedriver = None if self.mergedriver: - self._mdstate = 's' + self._mdstate = MERGE_DRIVER_STATE_SUCCESS else: - self._mdstate = 'u' + self._mdstate = MERGE_DRIVER_STATE_UNMARKED shutil.rmtree(self._repo.vfs.join('merge'), True) self._results = {} self._dirty = False @@ -152,27 +195,30 @@ if var in vars(self): delattr(self, var) self._readmergedriver = None - self._mdstate = 's' + self._mdstate = MERGE_DRIVER_STATE_SUCCESS unsupported = set() records = self._readrecords() for rtype, record in records: - if rtype == 'L': + if rtype == RECORD_LOCAL: self._local = bin(record) - elif rtype == 'O': + elif rtype == RECORD_OTHER: self._other = bin(record) - elif rtype == 'm': + elif rtype == RECORD_MERGE_DRIVER_STATE: bits = record.split('\0', 1) mdstate = bits[1] - if len(mdstate) != 1 or mdstate not in 
'ums': + if len(mdstate) != 1 or mdstate not in ( + MERGE_DRIVER_STATE_UNMARKED, MERGE_DRIVER_STATE_MARKED, + MERGE_DRIVER_STATE_SUCCESS): # the merge driver should be idempotent, so just rerun it - mdstate = 'u' + mdstate = MERGE_DRIVER_STATE_UNMARKED self._readmergedriver = bits[0] self._mdstate = mdstate - elif rtype in 'FDCP': + elif rtype in (RECORD_MERGED, RECORD_CHANGEDELETE_CONFLICT, + RECORD_PATH_CONFLICT, RECORD_MERGE_DRIVER_MERGE): bits = record.split('\0') self._state[bits[0]] = bits[1:] - elif rtype == 'f': + elif rtype == RECORD_FILE_VALUES: filename, rawextras = record.split('\0', 1) extraparts = rawextras.split('\0') extras = {} @@ -182,7 +228,7 @@ i += 2 self._stateextras[filename] = extras - elif rtype == 'l': + elif rtype == RECORD_LABELS: labels = record.split('\0', 2) self._labels = [l for l in labels if len(l) > 0] elif not rtype.islower(): @@ -216,12 +262,12 @@ # we have to infer the "other" changeset of the merge # we cannot do better than that with v1 of the format mctx = self._repo[None].parents()[-1] - v1records.append(('O', mctx.hex())) + v1records.append((RECORD_OTHER, mctx.hex())) # add place holder "other" file node information # nobody is using it yet so we do no need to fetch the data # if mctx was wrong `mctx[bits[-2]]` may fails. 
for idx, r in enumerate(v1records): - if r[0] == 'F': + if r[0] == RECORD_MERGED: bits = r[1].split('\0') bits.insert(-2, '') v1records[idx] = (r[0], '\0'.join(bits)) @@ -230,11 +276,11 @@ def _v1v2match(self, v1records, v2records): oldv2 = set() # old format version of v2 record for rec in v2records: - if rec[0] == 'L': + if rec[0] == RECORD_LOCAL: oldv2.add(rec) - elif rec[0] == 'F': + elif rec[0] == RECORD_MERGED: # drop the onode data (not contained in v1) - oldv2.add(('F', _droponode(rec[1]))) + oldv2.add((RECORD_MERGED, _droponode(rec[1]))) for rec in v1records: if rec not in oldv2: return False @@ -254,9 +300,9 @@ f = self._repo.vfs(self.statepathv1) for i, l in enumerate(f): if i == 0: - records.append(('L', l[:-1])) + records.append((RECORD_LOCAL, l[:-1])) else: - records.append(('F', l[:-1])) + records.append((RECORD_MERGED, l[:-1])) f.close() except IOError as err: if err.errno != errno.ENOENT: @@ -288,14 +334,14 @@ off = 0 end = len(data) while off < end: - rtype = data[off] + rtype = data[off:off + 1] off += 1 length = _unpack('>I', data[off:(off + 4)])[0] off += 4 record = data[off:(off + length)] off += length - if rtype == 't': - rtype, record = record[0], record[1:] + if rtype == RECORD_OVERRIDE: + rtype, record = record[0:1], record[1:] records.append((rtype, record)) f.close() except IOError as err: @@ -357,10 +403,10 @@ def _makerecords(self): records = [] - records.append(('L', hex(self._local))) - records.append(('O', hex(self._other))) + records.append((RECORD_LOCAL, hex(self._local))) + records.append((RECORD_OTHER, hex(self._other))) if self.mergedriver: - records.append(('m', '\0'.join([ + records.append((RECORD_MERGE_DRIVER_STATE, '\0'.join([ self.mergedriver, self._mdstate]))) # Write out state items. In all cases, the value of the state map entry # is written as the contents of the record. The record type depends on @@ -368,29 +414,35 @@ # to prevent older versions of Mercurial that do not support the feature # from loading them. 
for filename, v in self._state.iteritems(): - if v[0] == 'd': + if v[0] == MERGE_RECORD_DRIVER_RESOLVED: # Driver-resolved merge. These are stored in 'D' records. - records.append(('D', '\0'.join([filename] + v))) - elif v[0] in ('pu', 'pr'): + records.append((RECORD_MERGE_DRIVER_MERGE, + '\0'.join([filename] + v))) + elif v[0] in (MERGE_RECORD_UNRESOLVED_PATH, + MERGE_RECORD_RESOLVED_PATH): # Path conflicts. These are stored in 'P' records. The current # resolution state ('pu' or 'pr') is stored within the record. - records.append(('P', '\0'.join([filename] + v))) + records.append((RECORD_PATH_CONFLICT, + '\0'.join([filename] + v))) elif v[1] == nullhex or v[6] == nullhex: # Change/Delete or Delete/Change conflicts. These are stored in # 'C' records. v[1] is the local file, and is nullhex when the # file is deleted locally ('dc'). v[6] is the remote file, and # is nullhex when the file is deleted remotely ('cd'). - records.append(('C', '\0'.join([filename] + v))) + records.append((RECORD_CHANGEDELETE_CONFLICT, + '\0'.join([filename] + v))) else: # Normal files. These are stored in 'F' records. 
- records.append(('F', '\0'.join([filename] + v))) + records.append((RECORD_MERGED, + '\0'.join([filename] + v))) for filename, extras in sorted(self._stateextras.iteritems()): rawextras = '\0'.join('%s\0%s' % (k, v) for k, v in extras.iteritems()) - records.append(('f', '%s\0%s' % (filename, rawextras))) + records.append((RECORD_FILE_VALUES, + '%s\0%s' % (filename, rawextras))) if self._labels is not None: labels = '\0'.join(self._labels) - records.append(('l', labels)) + records.append((RECORD_LABELS, labels)) return records def _writerecords(self, records): @@ -400,13 +452,13 @@ def _writerecordsv1(self, records): """Write current state on disk in a version 1 file""" - f = self._repo.vfs(self.statepathv1, 'w') + f = self._repo.vfs(self.statepathv1, 'wb') irecords = iter(records) lrecords = next(irecords) - assert lrecords[0] == 'L' + assert lrecords[0] == RECORD_LOCAL f.write(hex(self._local) + '\n') for rtype, data in irecords: - if rtype == 'F': + if rtype == RECORD_MERGED: f.write('%s\n' % _droponode(data)) f.close() @@ -415,12 +467,12 @@ See the docstring for _readrecordsv2 for why we use 't'.""" # these are the records that all version 2 clients can read - whitelist = 'LOF' - f = self._repo.vfs(self.statepathv2, 'w') + allowlist = (RECORD_LOCAL, RECORD_OTHER, RECORD_MERGED) + f = self._repo.vfs(self.statepathv2, 'wb') for key, data in records: assert len(key) == 1 - if key not in whitelist: - key, data = 't', '%s%s' % (key, data) + if key not in allowlist: + key, data = RECORD_OVERRIDE, '%s%s' % (key, data) format = '>sI%is' % len(data) f.write(_pack(format, key, len(data), data)) f.close() @@ -439,7 +491,7 @@ else: hash = hex(hashlib.sha1(fcl.path()).digest()) self._repo.vfs.write('merge/' + hash, fcl.data()) - self._state[fd] = ['u', hash, fcl.path(), + self._state[fd] = [MERGE_RECORD_UNRESOLVED, hash, fcl.path(), fca.path(), hex(fca.filenode()), fco.path(), hex(fco.filenode()), fcl.flags()] @@ -452,7 +504,7 @@ frename: the filename the conflicting file 
was renamed to forigin: origin of the file ('l' or 'r' for local/remote) """ - self._state[path] = ['pu', frename, forigin] + self._state[path] = [MERGE_RECORD_UNRESOLVED_PATH, frename, forigin] self._dirty = True def __contains__(self, dfile): @@ -478,14 +530,15 @@ """Obtain the paths of unresolved files.""" for f, entry in self._state.iteritems(): - if entry[0] in ('u', 'pu'): + if entry[0] in (MERGE_RECORD_UNRESOLVED, + MERGE_RECORD_UNRESOLVED_PATH): yield f def driverresolved(self): """Obtain the paths of driver-resolved files.""" for f, entry in self._state.items(): - if entry[0] == 'd': + if entry[0] == MERGE_RECORD_DRIVER_RESOLVED: yield f def extras(self, filename): @@ -493,7 +546,8 @@ def _resolve(self, preresolve, dfile, wctx): """rerun merge process for file path `dfile`""" - if self[dfile] in 'rd': + if self[dfile] in (MERGE_RECORD_RESOLVED, + MERGE_RECORD_DRIVER_RESOLVED): return True, 0 stateentry = self._state[dfile] state, hash, lfile, afile, anode, ofile, onode, flags = stateentry @@ -507,7 +561,7 @@ fcd = self._filectxorabsent(hash, wctx, dfile) fco = self._filectxorabsent(onode, octx, ofile) # TODO: move this to filectxorabsent - fca = self._repo.filectx(afile, fileid=anode, changeid=actx) + fca = self._repo.filectx(afile, fileid=anode, changectx=actx) # "premerge" x flags flo = fco.flags() fla = fca.flags() @@ -543,7 +597,7 @@ self._stateextras.pop(dfile, None) self._dirty = True elif not r: - self.mark(dfile, 'r') + self.mark(dfile, MERGE_RECORD_RESOLVED) if complete: action = None @@ -551,18 +605,18 @@ if fcd.isabsent(): # dc: local picked. Need to drop if present, which may # happen on re-resolves. 
- action = 'f' + action = ACTION_FORGET else: # cd: remote picked (or otherwise deleted) - action = 'r' + action = ACTION_REMOVE else: if fcd.isabsent(): # dc: remote picked - action = 'g' + action = ACTION_GET elif fco.isabsent(): # cd: local picked if dfile in self.localctx: - action = 'am' + action = ACTION_ADD_MODIFIED else: - action = 'a' + action = ACTION_ADD # else: regular merges (no action necessary) self._results[dfile] = r, action @@ -594,7 +648,7 @@ if r is None: updated += 1 elif r == 0: - if action == 'r': + if action == ACTION_REMOVE: removed += 1 else: merged += 1 @@ -606,7 +660,13 @@ def actions(self): """return lists of actions to perform on the dirstate""" - actions = {'r': [], 'f': [], 'a': [], 'am': [], 'g': []} + actions = { + ACTION_REMOVE: [], + ACTION_FORGET: [], + ACTION_ADD: [], + ACTION_ADD_MODIFIED: [], + ACTION_GET: [], + } for f, (r, action) in self._results.iteritems(): if action is not None: actions[action].append((f, None, "merge result")) @@ -621,19 +681,19 @@ """queues a file to be removed from the dirstate Meant for use by custom merge drivers.""" - self._results[f] = 0, 'r' + self._results[f] = 0, ACTION_REMOVE def queueadd(self, f): """queues a file to be added to the dirstate Meant for use by custom merge drivers.""" - self._results[f] = 0, 'a' + self._results[f] = 0, ACTION_ADD def queueget(self, f): """queues a file to be marked modified in the dirstate Meant for use by custom merge drivers.""" - self._results[f] = 0, 'g' + self._results[f] = 0, ACTION_GET def _getcheckunknownconfig(repo, section, name): config = repo.ui.config(section, name) @@ -707,7 +767,8 @@ # Does the directory contain any files that are not in the dirstate? 
for p, dirs, files in repo.wvfs.walk(f): for fn in files: - relf = repo.dirstate.normalize(repo.wvfs.reljoin(p, fn)) + relf = util.pconvert(repo.wvfs.reljoin(p, fn)) + relf = repo.dirstate.normalize(relf, isknown=True) if relf not in repo.dirstate: return f return None @@ -734,14 +795,14 @@ checkunknowndirs = _unknowndirschecker() for f, (m, args, msg) in actions.iteritems(): - if m in ('c', 'dc'): + if m in (ACTION_CREATED, ACTION_DELETED_CHANGED): if _checkunknownfile(repo, wctx, mctx, f): fileconflicts.add(f) elif pathconfig and f not in wctx: path = checkunknowndirs(repo, wctx, f) if path is not None: pathconflicts.add(path) - elif m == 'dg': + elif m == ACTION_LOCAL_DIR_RENAME_GET: if _checkunknownfile(repo, wctx, mctx, f, args[0]): fileconflicts.add(f) @@ -753,7 +814,7 @@ collectconflicts(unknownconflicts, unknownconfig) else: for f, (m, args, msg) in actions.iteritems(): - if m == 'cm': + if m == ACTION_CREATED_MERGE: fl2, anc = args different = _checkunknownfile(repo, wctx, mctx, f) if repo.dirstate._ignore(f): @@ -774,16 +835,16 @@ # don't like an abort happening in the middle of # merge.update. 
if not different: - actions[f] = ('g', (fl2, False), "remote created") + actions[f] = (ACTION_GET, (fl2, False), 'remote created') elif mergeforce or config == 'abort': - actions[f] = ('m', (f, f, None, False, anc), - "remote differs from untracked local") + actions[f] = (ACTION_MERGE, (f, f, None, False, anc), + 'remote differs from untracked local') elif config == 'abort': abortconflicts.add(f) else: if config == 'warn': warnconflicts.add(f) - actions[f] = ('g', (fl2, True), "remote created") + actions[f] = (ACTION_GET, (fl2, True), 'remote created') for f in sorted(abortconflicts): warn = repo.ui.warn @@ -805,11 +866,11 @@ repo.ui.warn(_("%s: replacing untracked files in directory\n") % f) for f, (m, args, msg) in actions.iteritems(): - if m == 'c': + if m == ACTION_CREATED: backup = (f in fileconflicts or f in pathconflicts or any(p in pathconflicts for p in util.finddirs(f))) flags, = args - actions[f] = ('g', (flags, backup), msg) + actions[f] = (ACTION_GET, (flags, backup), msg) def _forgetremoved(wctx, mctx, branchmerge): """ @@ -827,9 +888,9 @@ """ actions = {} - m = 'f' + m = ACTION_FORGET if branchmerge: - m = 'r' + m = ACTION_REMOVE for f in wctx.deleted(): if f not in mctx: actions[f] = m, None, "forget deleted" @@ -837,7 +898,7 @@ if not branchmerge: for f in wctx.removed(): if f not in mctx: - actions[f] = 'f', None, "forget removed" + actions[f] = ACTION_FORGET, None, "forget removed" return actions @@ -846,19 +907,20 @@ pmmf = set(wmf) if actions: - # k, dr, e and rd are no-op - for m in 'a', 'am', 'f', 'g', 'cd', 'dc': + # KEEP and EXEC are no-op + for m in (ACTION_ADD, ACTION_ADD_MODIFIED, ACTION_FORGET, ACTION_GET, + ACTION_CHANGED_DELETED, ACTION_DELETED_CHANGED): for f, args, msg in actions[m]: pmmf.add(f) - for f, args, msg in actions['r']: + for f, args, msg in actions[ACTION_REMOVE]: pmmf.discard(f) - for f, args, msg in actions['dm']: + for f, args, msg in actions[ACTION_DIR_RENAME_MOVE_LOCAL]: f2, flags = args pmmf.discard(f2) pmmf.add(f) 
- for f, args, msg in actions['dg']: + for f, args, msg in actions[ACTION_LOCAL_DIR_RENAME_GET]: pmmf.add(f) - for f, args, msg in actions['m']: + for f, args, msg in actions[ACTION_MERGE]: f1, f2, fa, move, anc = args if move: pmmf.discard(f1) @@ -934,7 +996,8 @@ deletedfiles = set() for f, (m, args, msg) in actions.items(): - if m in ('c', 'dc', 'm', 'cm'): + if m in (ACTION_CREATED, ACTION_DELETED_CHANGED, ACTION_MERGE, + ACTION_CREATED_MERGE): # This action may create a new local file. createdfiledirs.update(util.finddirs(f)) if mf.hasdir(f): @@ -943,13 +1006,13 @@ # will be checked once we know what all the deleted files are. remoteconflicts.add(f) # Track the names of all deleted files. - if m == 'r': + if m == ACTION_REMOVE: deletedfiles.add(f) - if m == 'm': + if m == ACTION_MERGE: f1, f2, fa, move, anc = args if move: deletedfiles.add(f1) - if m == 'dm': + if m == ACTION_DIR_RENAME_MOVE_LOCAL: f2, flags = args deletedfiles.add(f2) @@ -965,7 +1028,10 @@ # A file is in a directory which aliases a local file. # We will need to rename the local file. localconflicts.add(p) - if p in actions and actions[p][0] in ('c', 'dc', 'm', 'cm'): + if p in actions and actions[p][0] in (ACTION_CREATED, + ACTION_DELETED_CHANGED, + ACTION_MERGE, + ACTION_CREATED_MERGE): # The file is in a directory which aliases a remote file. # This is an internal inconsistency within the remote # manifest. @@ -974,26 +1040,30 @@ # Rename all local conflicting files that have not been deleted. 
for p in localconflicts: if p not in deletedfiles: - ctxname = str(wctx).rstrip('+') + ctxname = bytes(wctx).rstrip('+') pnew = util.safename(p, ctxname, wctx, set(actions.keys())) - actions[pnew] = ('pr', (p,), "local path conflict") - actions[p] = ('p', (pnew, 'l'), "path conflict") + actions[pnew] = (ACTION_PATH_CONFLICT_RESOLVE, (p,), + 'local path conflict') + actions[p] = (ACTION_PATH_CONFLICT, (pnew, 'l'), + 'path conflict') if remoteconflicts: # Check if all files in the conflicting directories have been removed. - ctxname = str(mctx).rstrip('+') + ctxname = bytes(mctx).rstrip('+') for f, p in _filesindirs(repo, mf, remoteconflicts): if f not in deletedfiles: m, args, msg = actions[p] pnew = util.safename(p, ctxname, wctx, set(actions.keys())) - if m in ('dc', 'm'): + if m in (ACTION_DELETED_CHANGED, ACTION_MERGE): # Action was merge, just update target. actions[pnew] = (m, args, msg) else: # Action was create, change to renamed get action. fl = args[0] - actions[pnew] = ('dg', (p, fl), "remote path conflict") - actions[p] = ('p', (pnew, 'r'), "path conflict") + actions[pnew] = (ACTION_LOCAL_DIR_RENAME_GET, (p, fl), + 'remote path conflict') + actions[p] = (ACTION_PATH_CONFLICT, (pnew, ACTION_REMOVE), + 'path conflict') remoteconflicts.remove(p) break @@ -1071,77 +1141,80 @@ if f not in ma: fa = copy.get(f, None) if fa is not None: - actions[f] = ('m', (f, f, fa, False, pa.node()), - "both renamed from " + fa) + actions[f] = (ACTION_MERGE, (f, f, fa, False, pa.node()), + 'both renamed from %s' % fa) else: - actions[f] = ('m', (f, f, None, False, pa.node()), - "both created") + actions[f] = (ACTION_MERGE, (f, f, None, False, pa.node()), + 'both created') else: a = ma[f] fla = ma.flags(f) nol = 'l' not in fl1 + fl2 + fla if n2 == a and fl2 == fla: - actions[f] = ('k', (), "remote unchanged") + actions[f] = (ACTION_KEEP, (), 'remote unchanged') elif n1 == a and fl1 == fla: # local unchanged - use remote if n1 == n2: # optimization: keep local content - 
actions[f] = ('e', (fl2,), "update permissions") + actions[f] = (ACTION_EXEC, (fl2,), 'update permissions') else: - actions[f] = ('g', (fl2, False), "remote is newer") + actions[f] = (ACTION_GET, (fl2, False), + 'remote is newer') elif nol and n2 == a: # remote only changed 'x' - actions[f] = ('e', (fl2,), "update permissions") + actions[f] = (ACTION_EXEC, (fl2,), 'update permissions') elif nol and n1 == a: # local only changed 'x' - actions[f] = ('g', (fl1, False), "remote is newer") + actions[f] = (ACTION_GET, (fl1, False), 'remote is newer') else: # both changed something - actions[f] = ('m', (f, f, f, False, pa.node()), - "versions differ") + actions[f] = (ACTION_MERGE, (f, f, f, False, pa.node()), + 'versions differ') elif n1: # file exists only on local side if f in copied: pass # we'll deal with it on m2 side elif f in movewithdir: # directory rename, move local f2 = movewithdir[f] if f2 in m2: - actions[f2] = ('m', (f, f2, None, True, pa.node()), - "remote directory rename, both created") + actions[f2] = (ACTION_MERGE, (f, f2, None, True, pa.node()), + 'remote directory rename, both created') else: - actions[f2] = ('dm', (f, fl1), - "remote directory rename - move from " + f) + actions[f2] = (ACTION_DIR_RENAME_MOVE_LOCAL, (f, fl1), + 'remote directory rename - move from %s' % f) elif f in copy: f2 = copy[f] - actions[f] = ('m', (f, f2, f2, False, pa.node()), - "local copied/moved from " + f2) + actions[f] = (ACTION_MERGE, (f, f2, f2, False, pa.node()), + 'local copied/moved from %s' % f2) elif f in ma: # clean, a different, no remote if n1 != ma[f]: if acceptremote: - actions[f] = ('r', None, "remote delete") + actions[f] = (ACTION_REMOVE, None, 'remote delete') else: - actions[f] = ('cd', (f, None, f, False, pa.node()), - "prompt changed/deleted") + actions[f] = (ACTION_CHANGED_DELETED, + (f, None, f, False, pa.node()), + 'prompt changed/deleted') elif n1 == addednodeid: # This extra 'a' is added by working copy manifest to mark # the file as locally 
added. We should forget it instead of # deleting it. - actions[f] = ('f', None, "remote deleted") + actions[f] = (ACTION_FORGET, None, 'remote deleted') else: - actions[f] = ('r', None, "other deleted") + actions[f] = (ACTION_REMOVE, None, 'other deleted') elif n2: # file exists only on remote side if f in copied: pass # we'll deal with it on m1 side elif f in movewithdir: f2 = movewithdir[f] if f2 in m1: - actions[f2] = ('m', (f2, f, None, False, pa.node()), - "local directory rename, both created") + actions[f2] = (ACTION_MERGE, + (f2, f, None, False, pa.node()), + 'local directory rename, both created') else: - actions[f2] = ('dg', (f, fl2), - "local directory rename - get from " + f) + actions[f2] = (ACTION_LOCAL_DIR_RENAME_GET, (f, fl2), + 'local directory rename - get from %s' % f) elif f in copy: f2 = copy[f] if f2 in m2: - actions[f] = ('m', (f2, f, f2, False, pa.node()), - "remote copied from " + f2) + actions[f] = (ACTION_MERGE, (f2, f, f2, False, pa.node()), + 'remote copied from %s' % f2) else: - actions[f] = ('m', (f2, f, f2, True, pa.node()), - "remote moved from " + f2) + actions[f] = (ACTION_MERGE, (f2, f, f2, True, pa.node()), + 'remote moved from %s' % f2) elif f not in ma: # local unknown, remote created: the logic is described by the # following table: @@ -1155,12 +1228,12 @@ # Checking whether the files are different is expensive, so we # don't do that when we can avoid it. 
if not force: - actions[f] = ('c', (fl2,), "remote created") + actions[f] = (ACTION_CREATED, (fl2,), 'remote created') elif not branchmerge: - actions[f] = ('c', (fl2,), "remote created") + actions[f] = (ACTION_CREATED, (fl2,), 'remote created') else: - actions[f] = ('cm', (fl2, pa.node()), - "remote created, get or merge") + actions[f] = (ACTION_CREATED_MERGE, (fl2, pa.node()), + 'remote created, get or merge') elif n2 != ma[f]: df = None for d in dirmove: @@ -1169,13 +1242,15 @@ df = dirmove[d] + f[len(d):] break if df is not None and df in m1: - actions[df] = ('m', (df, f, f, False, pa.node()), - "local directory rename - respect move from " + f) + actions[df] = (ACTION_MERGE, (df, f, f, False, pa.node()), + 'local directory rename - respect move ' + 'from %s' % f) elif acceptremote: - actions[f] = ('c', (fl2,), "remote recreating") + actions[f] = (ACTION_CREATED, (fl2,), 'remote recreating') else: - actions[f] = ('dc', (None, f, f, False, pa.node()), - "prompt deleted/changed") + actions[f] = (ACTION_DELETED_CHANGED, + (None, f, f, False, pa.node()), + 'prompt deleted/changed') if repo.ui.configbool('experimental', 'merge.checkpathconflicts'): # If we are merging, look for path conflicts. @@ -1186,12 +1261,15 @@ def _resolvetrivial(repo, wctx, mctx, ancestor, actions): """Resolves false conflicts where the nodeid changed but the content remained the same.""" - - for f, (m, args, msg) in actions.items(): - if m == 'cd' and f in ancestor and not wctx[f].cmp(ancestor[f]): + # We force a copy of actions.items() because we're going to mutate + # actions as we resolve trivial conflicts. 
+ for f, (m, args, msg) in list(actions.items()): + if (m == ACTION_CHANGED_DELETED and f in ancestor + and not wctx[f].cmp(ancestor[f])): # local did change but ended up with same content - actions[f] = 'r', None, "prompt same" - elif m == 'dc' and f in ancestor and not mctx[f].cmp(ancestor[f]): + actions[f] = ACTION_REMOVE, None, 'prompt same' + elif (m == ACTION_DELETED_CHANGED and f in ancestor + and not mctx[f].cmp(ancestor[f])): # remote did change but ended up with same content del actions[f] # don't get = keep local deleted @@ -1255,18 +1333,18 @@ if all(a == l[0] for a in l[1:]): # len(bids) is > 1 repo.ui.note(_(" %s: consensus for %s\n") % (f, m)) actions[f] = l[0] - if m == 'dm': + if m == ACTION_DIR_RENAME_MOVE_LOCAL: dms.append(f) continue # If keep is an option, just do it. - if 'k' in bids: + if ACTION_KEEP in bids: repo.ui.note(_(" %s: picking 'keep' action\n") % f) - actions[f] = bids['k'][0] + actions[f] = bids[ACTION_KEEP][0] continue # If there are gets and they all agree [how could they not?], do it. - if 'g' in bids: - ga0 = bids['g'][0] - if all(a == ga0 for a in bids['g'][1:]): + if ACTION_GET in bids: + ga0 = bids[ACTION_GET][0] + if all(a == ga0 for a in bids[ACTION_GET][1:]): repo.ui.note(_(" %s: picking 'get' action\n") % f) actions[f] = ga0 continue @@ -1281,14 +1359,14 @@ repo.ui.warn(_(' %s: ambiguous merge - picked %s action\n') % (f, m)) actions[f] = l[0] - if m == 'dm': + if m == ACTION_DIR_RENAME_MOVE_LOCAL: dms.append(f) continue # Work around 'dm' that can cause multiple actions for the same file for f in dms: dm, (f0, flags), msg = actions[f] - assert dm == 'dm', dm - if f0 in actions and actions[f0][0] == 'r': + assert dm == ACTION_DIR_RENAME_MOVE_LOCAL, dm + if f0 in actions and actions[f0][0] == ACTION_REMOVE: # We have one bid for removing a file and another for moving it. # These two could be merged as first move and then delete ... # but instead drop moving and just delete. 
@@ -1386,6 +1464,52 @@ if i > 0: yield i, f +def _prefetchfiles(repo, ctx, actions): + """Invoke ``scmutil.prefetchfiles()`` for the files relevant to the dict + of merge actions. ``ctx`` is the context being merged in.""" + + # Skipping 'a', 'am', 'f', 'r', 'dm', 'e', 'k', 'p' and 'pr', because they + # don't touch the context to be merged in. 'cd' is skipped, because + # changed/deleted never resolves to something from the remote side. + oplist = [actions[a] for a in (ACTION_GET, ACTION_DELETED_CHANGED, + ACTION_LOCAL_DIR_RENAME_GET, ACTION_MERGE)] + prefetch = scmutil.prefetchfiles + matchfiles = scmutil.matchfiles + prefetch(repo, [ctx.rev()], + matchfiles(repo, + [f for sublist in oplist for f, args, msg in sublist])) + +@attr.s(frozen=True) +class updateresult(object): + updatedcount = attr.ib() + mergedcount = attr.ib() + removedcount = attr.ib() + unresolvedcount = attr.ib() + + def isempty(self): + return (not self.updatedcount and not self.mergedcount + and not self.removedcount and not self.unresolvedcount) + + # TODO remove container emulation once consumers switch to new API. + + def __getitem__(self, x): + util.nouideprecwarn('access merge.update() results by name instead of ' + 'index', '4.6', 2) + if x == 0: + return self.updatedcount + elif x == 1: + return self.mergedcount + elif x == 2: + return self.removedcount + elif x == 3: + return self.unresolvedcount + else: + raise IndexError('can only access items 0-3') + + def __len__(self): + util.nouideprecwarn('access merge.update() results by name instead of ' + 'index', '4.6', 2) + return 4 def applyupdates(repo, actions, wctx, mctx, overwrite, labels=None): """apply the merge action list to the working directory @@ -1397,6 +1521,8 @@ describes how many files were affected by the update. 
""" + _prefetchfiles(repo, mctx, actions) + updated, merged, removed = 0, 0, 0 ms = mergestate.clean(repo, wctx.p1().node(), mctx.node(), labels) moves = [] @@ -1404,9 +1530,9 @@ l.sort() # 'cd' and 'dc' actions are treated like other merge conflicts - mergeactions = sorted(actions['cd']) - mergeactions.extend(sorted(actions['dc'])) - mergeactions.extend(actions['m']) + mergeactions = sorted(actions[ACTION_CHANGED_DELETED]) + mergeactions.extend(sorted(actions[ACTION_DELETED_CHANGED])) + mergeactions.extend(actions[ACTION_MERGE]) for f, args, msg in mergeactions: f1, f2, fa, move, anc = args if f == '.hgsubstate': # merged internally @@ -1441,14 +1567,15 @@ wctx[f].audit() wctx[f].remove() - numupdates = sum(len(l) for m, l in actions.items() if m != 'k') + numupdates = sum(len(l) for m, l in actions.items() + if m != ACTION_KEEP) z = 0 - if [a for a in actions['r'] if a[0] == '.hgsubstate']: - subrepo.submerge(repo, wctx, mctx, wctx, overwrite, labels) + if [a for a in actions[ACTION_REMOVE] if a[0] == '.hgsubstate']: + subrepoutil.submerge(repo, wctx, mctx, wctx, overwrite, labels) # record path conflicts - for f, args, msg in actions['p']: + for f, args, msg in actions[ACTION_PATH_CONFLICT]: f1, fo = args s = repo.ui.status s(_("%s: path conflict - a file or link has the same name as a " @@ -1468,14 +1595,14 @@ # remove in parallel (must come before resolving path conflicts and getting) prog = worker.worker(repo.ui, cost, batchremove, (repo, wctx), - actions['r']) + actions[ACTION_REMOVE]) for i, item in prog: z += i progress(_updating, z, item=item, total=numupdates, unit=_files) - removed = len(actions['r']) + removed = len(actions[ACTION_REMOVE]) # resolve path conflicts (must come before getting) - for f, args, msg in actions['pr']: + for f, args, msg in actions[ACTION_PATH_CONFLICT_RESOLVE]: repo.ui.debug(" %s: %s -> pr\n" % (f, msg)) f0, = args if wctx[f0].lexists(): @@ -1488,40 +1615,40 @@ # get in parallel prog = worker.worker(repo.ui, cost, batchget, 
(repo, mctx, wctx), - actions['g']) + actions[ACTION_GET]) for i, item in prog: z += i progress(_updating, z, item=item, total=numupdates, unit=_files) - updated = len(actions['g']) + updated = len(actions[ACTION_GET]) - if [a for a in actions['g'] if a[0] == '.hgsubstate']: - subrepo.submerge(repo, wctx, mctx, wctx, overwrite, labels) + if [a for a in actions[ACTION_GET] if a[0] == '.hgsubstate']: + subrepoutil.submerge(repo, wctx, mctx, wctx, overwrite, labels) # forget (manifest only, just log it) (must come first) - for f, args, msg in actions['f']: + for f, args, msg in actions[ACTION_FORGET]: repo.ui.debug(" %s: %s -> f\n" % (f, msg)) z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) # re-add (manifest only, just log it) - for f, args, msg in actions['a']: + for f, args, msg in actions[ACTION_ADD]: repo.ui.debug(" %s: %s -> a\n" % (f, msg)) z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) # re-add/mark as modified (manifest only, just log it) - for f, args, msg in actions['am']: + for f, args, msg in actions[ACTION_ADD_MODIFIED]: repo.ui.debug(" %s: %s -> am\n" % (f, msg)) z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) # keep (noop, just log it) - for f, args, msg in actions['k']: + for f, args, msg in actions[ACTION_KEEP]: repo.ui.debug(" %s: %s -> k\n" % (f, msg)) # no progress # directory rename, move local - for f, args, msg in actions['dm']: + for f, args, msg in actions[ACTION_DIR_RENAME_MOVE_LOCAL]: repo.ui.debug(" %s: %s -> dm\n" % (f, msg)) z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) @@ -1533,7 +1660,7 @@ updated += 1 # local directory rename, get - for f, args, msg in actions['dg']: + for f, args, msg in actions[ACTION_LOCAL_DIR_RENAME_GET]: repo.ui.debug(" %s: %s -> dg\n" % (f, msg)) z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) @@ -1543,7 +1670,7 @@ updated += 1 # exec - for f, args, msg in actions['e']: + for f, args, msg in 
actions[ACTION_EXEC]: repo.ui.debug(" %s: %s -> e\n" % (f, msg)) z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) @@ -1568,7 +1695,8 @@ if not proceed: # XXX setting unresolved to at least 1 is a hack to make sure we # error out - return updated, merged, removed, max(len(unresolvedf), 1) + return updateresult(updated, merged, removed, + max(len(unresolvedf), 1)) newactions = [] for f, args, msg in mergeactions: if f in unresolvedf: @@ -1583,8 +1711,8 @@ z += 1 progress(_updating, z, item=f, total=numupdates, unit=_files) if f == '.hgsubstate': # subrepo states need updating - subrepo.submerge(repo, wctx, mctx, wctx.ancestor(mctx), - overwrite, labels) + subrepoutil.submerge(repo, wctx, mctx, wctx.ancestor(mctx), + overwrite, labels) continue wctx[f].audit() complete, r = ms.preresolve(f, wctx) @@ -1604,7 +1732,8 @@ unresolved = ms.unresolvedcount() - if usemergedriver and not unresolved and ms.mdstate() != 's': + if (usemergedriver and not unresolved + and ms.mdstate() != MERGE_DRIVER_STATE_SUCCESS): if not driverconclude(repo, ms, wctx, labels=labels): # XXX setting unresolved to at least 1 is a hack to make sure we # error out @@ -1619,17 +1748,17 @@ extraactions = ms.actions() if extraactions: - mfiles = set(a[0] for a in actions['m']) + mfiles = set(a[0] for a in actions[ACTION_MERGE]) for k, acts in extraactions.iteritems(): actions[k].extend(acts) - # Remove these files from actions['m'] as well. This is important - # because in recordupdates, files in actions['m'] are processed - # after files in other actions, and the merge driver might add - # files to those actions via extraactions above. This can lead to a - # file being recorded twice, with poor results. This is especially - # problematic for actions['r'] (currently only possible with the - # merge driver in the initial merge process; interrupted merges - # don't go through this flow). + # Remove these files from actions[ACTION_MERGE] as well. 
This is + # important because in recordupdates, files in actions[ACTION_MERGE] + # are processed after files in other actions, and the merge driver + # might add files to those actions via extraactions above. This can + # lead to a file being recorded twice, with poor results. This is + # especially problematic for actions[ACTION_REMOVE] (currently only + # possible with the merge driver in the initial merge process; + # interrupted merges don't go through this flow). # # The real fix here is to have indexes by both file and action so # that when the action for a file is changed it is automatically @@ -1640,27 +1769,27 @@ # those lists aren't consulted again. mfiles.difference_update(a[0] for a in acts) - actions['m'] = [a for a in actions['m'] if a[0] in mfiles] + actions[ACTION_MERGE] = [a for a in actions[ACTION_MERGE] + if a[0] in mfiles] progress(_updating, None, total=numupdates, unit=_files) - - return updated, merged, removed, unresolved + return updateresult(updated, merged, removed, unresolved) def recordupdates(repo, actions, branchmerge): "record merge actions to the dirstate" # remove (must come first) - for f, args, msg in actions.get('r', []): + for f, args, msg in actions.get(ACTION_REMOVE, []): if branchmerge: repo.dirstate.remove(f) else: repo.dirstate.drop(f) # forget (must come first) - for f, args, msg in actions.get('f', []): + for f, args, msg in actions.get(ACTION_FORGET, []): repo.dirstate.drop(f) # resolve path conflicts - for f, args, msg in actions.get('pr', []): + for f, args, msg in actions.get(ACTION_PATH_CONFLICT_RESOLVE, []): f0, = args origf0 = repo.dirstate.copied(f0) or f0 repo.dirstate.add(f) @@ -1671,33 +1800,33 @@ repo.dirstate.drop(f0) # re-add - for f, args, msg in actions.get('a', []): + for f, args, msg in actions.get(ACTION_ADD, []): repo.dirstate.add(f) # re-add/mark as modified - for f, args, msg in actions.get('am', []): + for f, args, msg in actions.get(ACTION_ADD_MODIFIED, []): if branchmerge: 
repo.dirstate.normallookup(f) else: repo.dirstate.add(f) # exec change - for f, args, msg in actions.get('e', []): + for f, args, msg in actions.get(ACTION_EXEC, []): repo.dirstate.normallookup(f) # keep - for f, args, msg in actions.get('k', []): + for f, args, msg in actions.get(ACTION_KEEP, []): pass # get - for f, args, msg in actions.get('g', []): + for f, args, msg in actions.get(ACTION_GET, []): if branchmerge: repo.dirstate.otherparent(f) else: repo.dirstate.normal(f) # merge - for f, args, msg in actions.get('m', []): + for f, args, msg in actions.get(ACTION_MERGE, []): f1, f2, fa, move, anc = args if branchmerge: # We've done a branch merge, mark this file as merged @@ -1722,7 +1851,7 @@ repo.dirstate.drop(f1) # directory rename, move local - for f, args, msg in actions.get('dm', []): + for f, args, msg in actions.get(ACTION_DIR_RENAME_MOVE_LOCAL, []): f0, flag = args if branchmerge: repo.dirstate.add(f) @@ -1733,7 +1862,7 @@ repo.dirstate.drop(f0) # directory rename, get - for f, args, msg in actions.get('dg', []): + for f, args, msg in actions.get(ACTION_LOCAL_DIR_RENAME_GET, []): f0, flag = args if branchmerge: repo.dirstate.add(f) @@ -1835,7 +1964,7 @@ else: pas = [p1.ancestor(p2, warn=branchmerge)] - fp1, fp2, xp1, xp2 = p1.node(), p2.node(), str(p1), str(p2) + fp1, fp2, xp1, xp2 = p1.node(), p2.node(), bytes(p1), bytes(p2) ### check phase if not overwrite: @@ -1865,7 +1994,7 @@ # call the hooks and exit early repo.hook('preupdate', throw=True, parent1=xp2, parent2='') repo.hook('update', parent1=xp2, parent2='', error=0) - return 0, 0, 0, 0 + return updateresult(0, 0, 0, 0) if (updatecheck == 'linear' and pas not in ([p1], [p2])): # nonlinear @@ -1906,43 +2035,59 @@ if updatecheck == 'noconflict': for f, (m, args, msg) in actionbyfile.iteritems(): - if m not in ('g', 'k', 'e', 'r', 'pr'): + if m not in (ACTION_GET, ACTION_KEEP, ACTION_EXEC, + ACTION_REMOVE, ACTION_PATH_CONFLICT_RESOLVE): msg = _("conflicting changes") hint = _("commit or update 
--clean to discard changes") raise error.Abort(msg, hint=hint) # Prompt and create actions. Most of this is in the resolve phase # already, but we can't handle .hgsubstate in filemerge or - # subrepo.submerge yet so we have to keep prompting for it. + # subrepoutil.submerge yet so we have to keep prompting for it. if '.hgsubstate' in actionbyfile: f = '.hgsubstate' m, args, msg = actionbyfile[f] prompts = filemerge.partextras(labels) prompts['f'] = f - if m == 'cd': + if m == ACTION_CHANGED_DELETED: if repo.ui.promptchoice( _("local%(l)s changed %(f)s which other%(o)s deleted\n" "use (c)hanged version or (d)elete?" "$$ &Changed $$ &Delete") % prompts, 0): - actionbyfile[f] = ('r', None, "prompt delete") + actionbyfile[f] = (ACTION_REMOVE, None, 'prompt delete') elif f in p1: - actionbyfile[f] = ('am', None, "prompt keep") + actionbyfile[f] = (ACTION_ADD_MODIFIED, None, 'prompt keep') else: - actionbyfile[f] = ('a', None, "prompt keep") - elif m == 'dc': + actionbyfile[f] = (ACTION_ADD, None, 'prompt keep') + elif m == ACTION_DELETED_CHANGED: f1, f2, fa, move, anc = args flags = p2[f2].flags() if repo.ui.promptchoice( _("other%(o)s changed %(f)s which local%(l)s deleted\n" "use (c)hanged version or leave (d)eleted?" 
"$$ &Changed $$ &Deleted") % prompts, 0) == 0: - actionbyfile[f] = ('g', (flags, False), "prompt recreating") + actionbyfile[f] = (ACTION_GET, (flags, False), + 'prompt recreating') else: del actionbyfile[f] # Convert to dictionary-of-lists format actions = dict((m, []) - for m in 'a am f g cd dc r dm dg m e k p pr'.split()) + for m in ( + ACTION_ADD, + ACTION_ADD_MODIFIED, + ACTION_FORGET, + ACTION_GET, + ACTION_CHANGED_DELETED, + ACTION_DELETED_CHANGED, + ACTION_REMOVE, + ACTION_DIR_RENAME_MOVE_LOCAL, + ACTION_LOCAL_DIR_RENAME_GET, + ACTION_MERGE, + ACTION_EXEC, + ACTION_KEEP, + ACTION_PATH_CONFLICT, + ACTION_PATH_CONFLICT_RESOLVE)) for f, (m, args, msg) in actionbyfile.iteritems(): if m not in actions: actions[m] = [] @@ -1992,6 +2137,8 @@ fsmonitorthreshold = repo.ui.configint('fsmonitor', 'warn_update_file_count') try: + # avoid cycle: extensions -> cmdutil -> merge + from . import extensions extensions.find('fsmonitor') fsmonitorenabled = repo.ui.config('fsmonitor', 'mode') != 'off' # We intentionally don't look at whether fsmonitor has disabled @@ -2003,7 +2150,7 @@ if (fsmonitorwarning and not fsmonitorenabled and p1.node() == nullid - and len(actions['g']) >= fsmonitorthreshold + and len(actions[ACTION_GET]) >= fsmonitorthreshold and pycompat.sysplatform.startswith(('linux', 'darwin'))): repo.ui.warn( _('(warning: large working directory being used without ' @@ -2028,7 +2175,8 @@ sparse.prunetemporaryincludes(repo) if not partial: - repo.hook('update', parent1=xp1, parent2=xp2, error=stats[3]) + repo.hook('update', parent1=xp1, parent2=xp2, + error=stats.unresolvedcount) return stats def graft(repo, ctx, pctx, labels, keepparent=False): diff -r fb92df8b634c -r ed5448edcbfa mercurial/minirst.py --- a/mercurial/minirst.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/minirst.py Wed Apr 18 15:32:08 2018 -0400 @@ -27,7 +27,9 @@ encoding, pycompat, url, - util, +) +from .utils import ( + stringutil, ) def section(s): @@ -459,9 +461,9 @@ hanging = 
block['optstrwidth'] initindent = '%s%s ' % (block['optstr'], ' ' * ((hanging - colwidth))) hangindent = ' ' * (encoding.colwidth(initindent) + 1) - return ' %s\n' % (util.wrap(desc, usablewidth, - initindent=initindent, - hangindent=hangindent)) + return ' %s\n' % (stringutil.wrap(desc, usablewidth, + initindent=initindent, + hangindent=hangindent)) def formatblock(block, width): """Format a block according to width.""" @@ -477,9 +479,9 @@ defindent = indent + hang * ' ' text = ' '.join(map(bytes.strip, block['lines'])) return '%s\n%s\n' % (indent + admonition, - util.wrap(text, width=width, - initindent=defindent, - hangindent=defindent)) + stringutil.wrap(text, width=width, + initindent=defindent, + hangindent=defindent)) if block['type'] == 'margin': return '\n' if block['type'] == 'literal': @@ -503,7 +505,9 @@ pad = ' ' * (w - encoding.colwidth(v)) l.append(v + pad) l = ' '.join(l) - l = util.wrap(l, width=width, initindent=indent, hangindent=hang) + l = stringutil.wrap(l, width=width, + initindent=indent, + hangindent=hang) if not text and block['header']: text = l + '\n' + indent + '-' * (min(width, span)) + '\n' else: @@ -514,9 +518,9 @@ hang = len(block['lines'][-1]) - len(block['lines'][-1].lstrip()) defindent = indent + hang * ' ' text = ' '.join(map(bytes.strip, block['lines'][1:])) - return '%s\n%s\n' % (term, util.wrap(text, width=width, - initindent=defindent, - hangindent=defindent)) + return '%s\n%s\n' % (term, stringutil.wrap(text, width=width, + initindent=defindent, + hangindent=defindent)) subindent = indent if block['type'] == 'bullet': if block['lines'][0].startswith('| '): @@ -540,9 +544,9 @@ return formatoption(block, width) text = ' '.join(map(bytes.strip, block['lines'])) - return util.wrap(text, width=width, - initindent=indent, - hangindent=subindent) + '\n' + return stringutil.wrap(text, width=width, + initindent=indent, + hangindent=subindent) + '\n' def formathtml(blocks): """Format RST blocks as HTML""" diff -r fb92df8b634c -r 
ed5448edcbfa mercurial/namespaces.py --- a/mercurial/namespaces.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/namespaces.py Wed Apr 18 15:32:08 2018 -0400 @@ -2,6 +2,7 @@ from .i18n import _ from . import ( + registrar, templatekw, util, ) @@ -87,10 +88,10 @@ # we only generate a template keyword if one does not already exist if namespace.name not in templatekw.keywords: - def generatekw(**args): - return templatekw.shownames(namespace.name, **args) - - templatekw.keywords[namespace.name] = generatekw + templatekeyword = registrar.templatekeyword(templatekw.keywords) + @templatekeyword(namespace.name, requires={'repo', 'ctx'}) + def generatekw(context, mapping): + return templatekw.shownames(context, mapping, namespace.name) def singlenode(self, repo, name): """ diff -r fb92df8b634c -r ed5448edcbfa mercurial/narrowspec.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/narrowspec.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,199 @@ +# narrowspec.py - methods for working with a narrow view of a repository +# +# Copyright 2017 Google, Inc. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import errno + +from .i18n import _ +from . 
import ( + error, + match as matchmod, + util, +) + +FILENAME = 'narrowspec' + +def _parsestoredpatterns(text): + """Parses the narrowspec format that's stored on disk.""" + patlist = None + includepats = [] + excludepats = [] + for l in text.splitlines(): + if l == '[includes]': + if patlist is None: + patlist = includepats + else: + raise error.Abort(_('narrowspec includes section must appear ' + 'at most once, before excludes')) + elif l == '[excludes]': + if patlist is not excludepats: + patlist = excludepats + else: + raise error.Abort(_('narrowspec excludes section must appear ' + 'at most once')) + else: + patlist.append(l) + + return set(includepats), set(excludepats) + +def parseserverpatterns(text): + """Parses the narrowspec format that's returned by the server.""" + includepats = set() + excludepats = set() + + # We get one entry per line, in the format " ". + # It's OK for value to contain other spaces. + for kp in (l.split(' ', 1) for l in text.splitlines()): + if len(kp) != 2: + raise error.Abort(_('Invalid narrowspec pattern line: "%s"') % kp) + key = kp[0] + pat = kp[1] + if key == 'include': + includepats.add(pat) + elif key == 'exclude': + excludepats.add(pat) + else: + raise error.Abort(_('Invalid key "%s" in server response') % key) + + return includepats, excludepats + +def normalizesplitpattern(kind, pat): + """Returns the normalized version of a pattern and kind. + + Returns a tuple with the normalized kind and normalized pattern. + """ + pat = pat.rstrip('/') + _validatepattern(pat) + return kind, pat + +def _numlines(s): + """Returns the number of lines in s, including ending empty lines.""" + # We use splitlines because it is Unicode-friendly and thus Python 3 + # compatible. However, it does not count empty lines at the end, so trick + # it by adding a character at the end. + return len((s + 'x').splitlines()) + +def _validatepattern(pat): + """Validates the pattern and aborts if it is invalid. 
+ + Patterns are stored in the narrowspec as newline-separated + POSIX-style bytestring paths. There's no escaping. + """ + + # We use newlines as separators in the narrowspec file, so don't allow them + # in patterns. + if _numlines(pat) > 1: + raise error.Abort(_('newlines are not allowed in narrowspec paths')) + + components = pat.split('/') + if '.' in components or '..' in components: + raise error.Abort(_('"." and ".." are not allowed in narrowspec paths')) + +def normalizepattern(pattern, defaultkind='path'): + """Returns the normalized version of a text-format pattern. + + If the pattern has no kind, the default will be added. + """ + kind, pat = matchmod._patsplit(pattern, defaultkind) + return '%s:%s' % normalizesplitpattern(kind, pat) + +def parsepatterns(pats): + """Parses a list of patterns into a typed pattern set.""" + return set(normalizepattern(p) for p in pats) + +def format(includes, excludes): + output = '[includes]\n' + for i in sorted(includes - excludes): + output += i + '\n' + output += '[excludes]\n' + for e in sorted(excludes): + output += e + '\n' + return output + +def match(root, include=None, exclude=None): + if not include: + # Passing empty include and empty exclude to matchmod.match() + # gives a matcher that matches everything, so explicitly use + # the nevermatcher. + return matchmod.never(root, '') + return matchmod.match(root, '', [], include=include or [], + exclude=exclude or []) + +def needsexpansion(includes): + return [i for i in includes if i.startswith('include:')] + +def load(repo): + try: + spec = repo.vfs.read(FILENAME) + except IOError as e: + # Treat "narrowspec does not exist" the same as "narrowspec file exists + # and is empty". + if e.errno == errno.ENOENT: + # Without this the next call to load will use the cached + # non-existence of the file, which can cause some odd issues. 
+ repo.invalidate(clearfilecache=True) + return set(), set() + raise + return _parsestoredpatterns(spec) + +def save(repo, includepats, excludepats): + spec = format(includepats, excludepats) + repo.vfs.write(FILENAME, spec) + +def restrictpatterns(req_includes, req_excludes, repo_includes, repo_excludes): + r""" Restricts the patterns according to repo settings, + results in a logical AND operation + + :param req_includes: requested includes + :param req_excludes: requested excludes + :param repo_includes: repo includes + :param repo_excludes: repo excludes + :return: include patterns, exclude patterns, and invalid include patterns. + + >>> restrictpatterns({'f1','f2'}, {}, ['f1'], []) + (set(['f1']), {}, []) + >>> restrictpatterns({'f1'}, {}, ['f1','f2'], []) + (set(['f1']), {}, []) + >>> restrictpatterns({'f1/fc1', 'f3/fc3'}, {}, ['f1','f2'], []) + (set(['f1/fc1']), {}, []) + >>> restrictpatterns({'f1_fc1'}, {}, ['f1','f2'], []) + ([], set(['path:.']), []) + >>> restrictpatterns({'f1/../f2/fc2'}, {}, ['f1','f2'], []) + (set(['f2/fc2']), {}, []) + >>> restrictpatterns({'f1/../f3/fc3'}, {}, ['f1','f2'], []) + ([], set(['path:.']), []) + >>> restrictpatterns({'f1/$non_exitent_var'}, {}, ['f1','f2'], []) + (set(['f1/$non_exitent_var']), {}, []) + """ + res_excludes = set(req_excludes) + res_excludes.update(repo_excludes) + invalid_includes = [] + if not req_includes: + res_includes = set(repo_includes) + elif 'path:.' 
not in repo_includes: + res_includes = [] + for req_include in req_includes: + req_include = util.expandpath(util.normpath(req_include)) + if req_include in repo_includes: + res_includes.append(req_include) + continue + valid = False + for repo_include in repo_includes: + if req_include.startswith(repo_include + '/'): + valid = True + res_includes.append(req_include) + break + if not valid: + invalid_includes.append(req_include) + if len(res_includes) == 0: + res_excludes = {'path:.'} + else: + res_includes = set(res_includes) + else: + res_includes = set(req_includes) + return res_includes, res_excludes, invalid_includes diff -r fb92df8b634c -r ed5448edcbfa mercurial/node.py --- a/mercurial/node.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/node.py Wed Apr 18 15:32:08 2018 -0400 @@ -11,7 +11,14 @@ # This ugly style has a noticeable effect in manifest parsing hex = binascii.hexlify -bin = binascii.unhexlify +# Adapt to Python 3 API changes. If this ends up showing up in +# profiles, we can use this version only on Python 3, and forward +# binascii.unhexlify like we used to on Python 2. 
+def bin(s): + try: + return binascii.unhexlify(s) + except binascii.Error as e: + raise TypeError(e) nullrev = -1 nullid = b"\0" * 20 @@ -23,7 +30,7 @@ addednodeid = ('0' * 15) + 'added' modifiednodeid = ('0' * 12) + 'modified' -wdirnodes = {newnodeid, addednodeid, modifiednodeid} +wdirfilenodeids = {newnodeid, addednodeid, modifiednodeid} # pseudo identifiers for working directory # (they are experimental, so don't add too many dependencies on them) diff -r fb92df8b634c -r ed5448edcbfa mercurial/obsolete.py --- a/mercurial/obsolete.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/obsolete.py Wed Apr 18 15:32:08 2018 -0400 @@ -81,6 +81,7 @@ policy, util, ) +from .utils import dateutil parsers = policy.importmod(r'parsers') @@ -132,53 +133,34 @@ return option in result +def getoptions(repo): + """Returns dicts showing state of obsolescence features.""" + + createmarkersvalue = _getoptionvalue(repo, createmarkersopt) + unstablevalue = _getoptionvalue(repo, allowunstableopt) + exchangevalue = _getoptionvalue(repo, exchangeopt) + + # createmarkers must be enabled if other options are enabled + if ((unstablevalue or exchangevalue) and not createmarkersvalue): + raise error.Abort(_("'createmarkers' obsolete option must be enabled " + "if other obsolete options are enabled")) + + return { + createmarkersopt: createmarkersvalue, + allowunstableopt: unstablevalue, + exchangeopt: exchangevalue, + } + def isenabled(repo, option): """Returns True if the given repository has the given obsolete option enabled. 
""" - createmarkersvalue = _getoptionvalue(repo, createmarkersopt) - unstabluevalue = _getoptionvalue(repo, allowunstableopt) - exchangevalue = _getoptionvalue(repo, exchangeopt) - - # createmarkers must be enabled if other options are enabled - if ((unstabluevalue or exchangevalue) and not createmarkersvalue): - raise error.Abort(_("'createmarkers' obsolete option must be enabled " - "if other obsolete options are enabled")) - - return _getoptionvalue(repo, option) - -### obsolescence marker flag + return getoptions(repo)[option] -## bumpedfix flag -# -# When a changeset A' succeed to a changeset A which became public, we call A' -# "bumped" because it's a successors of a public changesets -# -# o A' (bumped) -# |`: -# | o A -# |/ -# o Z -# -# The way to solve this situation is to create a new changeset Ad as children -# of A. This changeset have the same content than A'. So the diff from A to A' -# is the same than the diff from A to Ad. Ad is marked as a successors of A' -# -# o Ad -# |`: -# | x A' -# |'| -# o | A -# |/ -# o Z -# -# But by transitivity Ad is also a successors of A. To avoid having Ad marked -# as bumped too, we add the `bumpedfix` flag to the marker. . -# This flag mean that the successors express the changes between the public and -# bumped version and fix the situation, breaking the transitivity of -# "bumped" here. 
-bumpedfix = 1 -usingsha256 = 2 +# Creating aliases for marker flags because evolve extension looks for +# bumpedfix in obsolete.py +bumpedfix = obsutil.bumpedfix +usingsha256 = obsutil.usingsha256 ## Parsing and writing of version "0" # @@ -506,13 +488,6 @@ for mark in markers: successors.setdefault(mark[0], set()).add(mark) -def _addprecursors(*args, **kwargs): - msg = ("'obsolete._addprecursors' is deprecated, " - "use 'obsolete._addpredecessors'") - util.nouideprecwarn(msg, '4.4') - - return _addpredecessors(*args, **kwargs) - @util.nogc def _addpredecessors(predecessors, markers): for mark in markers: @@ -570,7 +545,7 @@ return len(self._all) def __nonzero__(self): - if not self._cached('_all'): + if not self._cached(r'_all'): try: return self.svfs.stat('obsstore').st_size > 1 except OSError as inst: @@ -608,13 +583,13 @@ if date is None: if 'date' in metadata: # as a courtesy for out-of-tree extensions - date = util.parsedate(metadata.pop('date')) + date = dateutil.parsedate(metadata.pop('date')) elif ui is not None: date = ui.configdate('devel', 'default-date') if date is None: - date = util.makedate() + date = dateutil.makedate() else: - date = util.makedate() + date = dateutil.makedate() if len(prec) != 20: raise ValueError(prec) for succ in succs: @@ -663,7 +638,7 @@ self.caches.clear() # records the number of new markers for the transaction hooks previous = int(transaction.hookargs.get('new_obsmarkers', '0')) - transaction.hookargs['new_obsmarkers'] = str(previous + len(new)) + transaction.hookargs['new_obsmarkers'] = '%d' % (previous + len(new)) return len(new) def mergemarkers(self, transaction, data): @@ -700,14 +675,6 @@ _addsuccessors(successors, self._all) return successors - @property - def precursors(self): - msg = ("'obsstore.precursors' is deprecated, " - "use 'obsstore.predecessors'") - util.nouideprecwarn(msg, '4.4') - - return self.predecessors - @propertycache def predecessors(self): predecessors = {} @@ -727,11 +694,11 @@ markers = 
list(markers) # to allow repeated iteration self._data = self._data + rawdata self._all.extend(markers) - if self._cached('successors'): + if self._cached(r'successors'): _addsuccessors(self.successors, markers) - if self._cached('predecessors'): + if self._cached(r'predecessors'): _addpredecessors(self.predecessors, markers) - if self._cached('children'): + if self._cached(r'children'): _addchildren(self.children, markers) _checkinvalidmarkers(markers) @@ -843,42 +810,6 @@ repo.invalidatevolatilesets() return True -# keep compatibility for the 4.3 cycle -def allprecursors(obsstore, nodes, ignoreflags=0): - movemsg = 'obsolete.allprecursors moved to obsutil.allprecursors' - util.nouideprecwarn(movemsg, '4.3') - return obsutil.allprecursors(obsstore, nodes, ignoreflags) - -def allsuccessors(obsstore, nodes, ignoreflags=0): - movemsg = 'obsolete.allsuccessors moved to obsutil.allsuccessors' - util.nouideprecwarn(movemsg, '4.3') - return obsutil.allsuccessors(obsstore, nodes, ignoreflags) - -def marker(repo, data): - movemsg = 'obsolete.marker moved to obsutil.marker' - repo.ui.deprecwarn(movemsg, '4.3') - return obsutil.marker(repo, data) - -def getmarkers(repo, nodes=None, exclusive=False): - movemsg = 'obsolete.getmarkers moved to obsutil.getmarkers' - repo.ui.deprecwarn(movemsg, '4.3') - return obsutil.getmarkers(repo, nodes=nodes, exclusive=exclusive) - -def exclusivemarkers(repo, nodes): - movemsg = 'obsolete.exclusivemarkers moved to obsutil.exclusivemarkers' - repo.ui.deprecwarn(movemsg, '4.3') - return obsutil.exclusivemarkers(repo, nodes) - -def foreground(repo, nodes): - movemsg = 'obsolete.foreground moved to obsutil.foreground' - repo.ui.deprecwarn(movemsg, '4.3') - return obsutil.foreground(repo, nodes) - -def successorssets(repo, initialnode, cache=None): - movemsg = 'obsolete.successorssets moved to obsutil.successorssets' - repo.ui.deprecwarn(movemsg, '4.3') - return obsutil.successorssets(repo, initialnode, cache=cache) - # mapping of 'set-name' -> 
cachefuncs = {} def cachefor(name): @@ -933,14 +864,6 @@ obs = set(r for r in notpublic if isobs(getnode(r))) return obs -@cachefor('unstable') -def _computeunstableset(repo): - msg = ("'unstable' volatile set is deprecated, " - "use 'orphan'") - repo.ui.deprecwarn(msg, '4.4') - - return _computeorphanset(repo) - @cachefor('orphan') def _computeorphanset(repo): """the set of non obsolete revisions with obsolete parents""" @@ -969,14 +892,6 @@ """the set of obsolete parents without non obsolete descendants""" return getrevs(repo, 'obsolete') - getrevs(repo, 'suspended') -@cachefor('bumped') -def _computebumpedset(repo): - msg = ("'bumped' volatile set is deprecated, " - "use 'phasedivergent'") - repo.ui.deprecwarn(msg, '4.4') - - return _computephasedivergentset(repo) - @cachefor('phasedivergent') def _computephasedivergentset(repo): """the set of revs trying to obsolete public revisions""" @@ -1000,14 +915,6 @@ break # Next draft! return bumped -@cachefor('divergent') -def _computedivergentset(repo): - msg = ("'divergent' volatile set is deprecated, " - "use 'contentdivergent'") - repo.ui.deprecwarn(msg, '4.4') - - return _computecontentdivergentset(repo) - @cachefor('contentdivergent') def _computecontentdivergentset(repo): """the set of rev that compete to be the final successors of some revision. diff -r fb92df8b634c -r ed5448edcbfa mercurial/obsutil.py --- a/mercurial/obsutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/obsutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -15,6 +15,40 @@ phases, util, ) +from .utils import dateutil + +### obsolescence marker flag + +## bumpedfix flag +# +# When a changeset A' succeed to a changeset A which became public, we call A' +# "bumped" because it's a successors of a public changesets +# +# o A' (bumped) +# |`: +# | o A +# |/ +# o Z +# +# The way to solve this situation is to create a new changeset Ad as children +# of A. This changeset have the same content than A'. 
So the diff from A to A' +# is the same than the diff from A to Ad. Ad is marked as a successors of A' +# +# o Ad +# |`: +# | x A' +# |'| +# o | A +# |/ +# o Z +# +# But by transitivity Ad is also a successors of A. To avoid having Ad marked +# as bumped too, we add the `bumpedfix` flag to the marker. . +# This flag mean that the successors express the changes between the public and +# bumped version and fix the situation, breaking the transitivity of +# "bumped" here. +bumpedfix = 1 +usingsha256 = 2 class marker(object): """Wrap obsolete marker raw data""" @@ -33,12 +67,6 @@ return False return self._data == other._data - def precnode(self): - msg = ("'marker.precnode' is deprecated, " - "use 'marker.prednode'") - util.nouideprecwarn(msg, '4.4') - return self.prednode() - def prednode(self): """Predecessor changeset node identifier""" return self._data[0] @@ -106,15 +134,6 @@ else: stack.append(precnodeid) -def allprecursors(*args, **kwargs): - """ (DEPRECATED) - """ - msg = ("'obsutil.allprecursors' is deprecated, " - "use 'obsutil.allpredecessors'") - util.nouideprecwarn(msg, '4.4') - - return allpredecessors(*args, **kwargs) - def allpredecessors(obsstore, nodes, ignoreflags=0): """Yield node for every precursors of . 
@@ -421,10 +440,10 @@ # Check if other meta has changed changeextra = changectx.extra().items() - ctxmeta = filter(metanotblacklisted, changeextra) + ctxmeta = list(filter(metanotblacklisted, changeextra)) sourceextra = source.extra().items() - srcmeta = filter(metanotblacklisted, sourceextra) + srcmeta = list(filter(metanotblacklisted, sourceextra)) if ctxmeta != srcmeta: effects |= METACHANGED @@ -813,7 +832,7 @@ return sorted(operations) -def obsfateprinter(successors, markers, ui): +def obsfateprinter(ui, repo, successors, markers, formatctx): """ Build a obsfate string for a single successorset using all obsfate related function defined in obsutil """ @@ -833,7 +852,7 @@ # Successors if successors: - fmtsuccessors = [successors.joinfmt(succ) for succ in successors] + fmtsuccessors = [formatctx(repo[succ]) for succ in successors] line.append(" as %s" % ", ".join(fmtsuccessors)) # Users @@ -856,11 +875,11 @@ max_date = max(dates) if min_date == max_date: - fmtmin_date = util.datestr(min_date, '%Y-%m-%d %H:%M %1%2') + fmtmin_date = dateutil.datestr(min_date, '%Y-%m-%d %H:%M %1%2') line.append(" (at %s)" % fmtmin_date) else: - fmtmin_date = util.datestr(min_date, '%Y-%m-%d %H:%M %1%2') - fmtmax_date = util.datestr(max_date, '%Y-%m-%d %H:%M %1%2') + fmtmin_date = dateutil.datestr(min_date, '%Y-%m-%d %H:%M %1%2') + fmtmax_date = dateutil.datestr(max_date, '%Y-%m-%d %H:%M %1%2') line.append(" (between %s and %s)" % (fmtmin_date, fmtmax_date)) return "".join(line) @@ -904,3 +923,55 @@ args = (changeid, firstsuccessors, remainingnumber) return filteredmsgtable['superseded_split_several'] % args + +def divergentsets(repo, ctx): + """Compute sets of commits divergent with a given one""" + cache = {} + base = {} + for n in allpredecessors(repo.obsstore, [ctx.node()]): + if n == ctx.node(): + # a node can't be a base for divergence with itself + continue + nsuccsets = successorssets(repo, n, cache) + for nsuccset in nsuccsets: + if ctx.node() in nsuccset: + # we are only 
interested in *other* successor sets + continue + if tuple(nsuccset) in base: + # we already know the latest base for this divergency + continue + base[tuple(nsuccset)] = n + return [{'divergentnodes': divset, 'commonpredecessor': b} + for divset, b in base.iteritems()] + +def whyunstable(repo, ctx): + result = [] + if ctx.orphan(): + for parent in ctx.parents(): + kind = None + if parent.orphan(): + kind = 'orphan' + elif parent.obsolete(): + kind = 'obsolete' + if kind is not None: + result.append({'instability': 'orphan', + 'reason': '%s parent' % kind, + 'node': parent.hex()}) + if ctx.phasedivergent(): + predecessors = allpredecessors(repo.obsstore, [ctx.node()], + ignoreflags=bumpedfix) + immutable = [repo[p] for p in predecessors + if p in repo and not repo[p].mutable()] + for predecessor in immutable: + result.append({'instability': 'phase-divergent', + 'reason': 'immutable predecessor', + 'node': predecessor.hex()}) + if ctx.contentdivergent(): + dsets = divergentsets(repo, ctx) + for dset in dsets: + divnodes = [repo[n] for n in dset['divergentnodes']] + result.append({'instability': 'content-divergent', + 'divergentnodes': divnodes, + 'reason': 'predecessor', + 'node': nodemod.hex(dset['commonpredecessor'])}) + return result diff -r fb92df8b634c -r ed5448edcbfa mercurial/parser.py --- a/mercurial/parser.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/parser.py Wed Apr 18 15:32:08 2018 -0400 @@ -22,8 +22,12 @@ from . 
import ( encoding, error, + pycompat, util, ) +from .utils import ( + stringutil, +) class parser(object): def __init__(self, elements, methods=None): @@ -189,14 +193,14 @@ def unescapestr(s): try: - return util.unescapestr(s) + return stringutil.unescapestr(s) except ValueError as e: # mangle Python's exception into our format - raise error.ParseError(str(e).lower()) + raise error.ParseError(pycompat.bytestr(e).lower()) def _brepr(obj): if isinstance(obj, bytes): - return b"'%s'" % util.escapestr(obj) + return b"'%s'" % stringutil.escapestr(obj) return encoding.strtolocal(repr(obj)) def _prettyformat(tree, leafnodes, level, lines): diff -r fb92df8b634c -r ed5448edcbfa mercurial/patch.py --- a/mercurial/patch.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/patch.py Wed Apr 18 15:32:08 2018 -0400 @@ -9,10 +9,9 @@ from __future__ import absolute_import, print_function import collections +import contextlib import copy -import difflib import email -import email.parser as emailparser import errno import hashlib import os @@ -29,25 +28,30 @@ ) from . 
import ( copies, + diffhelpers, encoding, error, mail, mdiff, pathutil, - policy, pycompat, scmutil, similar, util, vfs as vfsmod, ) +from .utils import ( + dateutil, + procutil, + stringutil, +) -diffhelpers = policy.importmod(r'diffhelpers') stringio = util.stringio gitre = re.compile(br'diff --git a/(.*) b/(.*)') tabsplitter = re.compile(br'(\t+|[^\t]+)') -_nonwordre = re.compile(br'([^a-zA-Z0-9_\x80-\xff])') +wordsplitter = re.compile(br'(\t+| +|[a-zA-Z0-9_\x80-\xff]+|' + '[^ \ta-zA-Z0-9_\x80-\xff])') PatchError = error.PatchError @@ -56,10 +60,10 @@ def split(stream): '''return an iterator of individual patches from a stream''' def isheader(line, inheader): - if inheader and line[0] in (' ', '\t'): + if inheader and line.startswith((' ', '\t')): # continuation return True - if line[0] in (' ', '-', '+'): + if line.startswith((' ', '-', '+')): # diff line - don't check for header pattern in there return False l = line.split(': ', 1) @@ -109,7 +113,7 @@ cur.append(line) c = chunk(cur) - m = emailparser.Parser().parse(c) + m = pycompat.emailparser().parse(c) if not m.is_multipart(): yield msgfp(m) else: @@ -189,6 +193,7 @@ ('Node ID', 'nodeid'), ] +@contextlib.contextmanager def extract(ui, fileobj): '''extract patch from data read from fileobj. @@ -206,6 +211,16 @@ Any item can be missing from the dictionary. If filename is missing, fileobj did not contain a patch. 
Caller must unlink filename when done.''' + fd, tmpname = tempfile.mkstemp(prefix='hg-patch-') + tmpfp = os.fdopen(fd, r'wb') + try: + yield _extract(ui, fileobj, tmpname, tmpfp) + finally: + tmpfp.close() + os.unlink(tmpname) + +def _extract(ui, fileobj, tmpname, tmpfp): + # attempt to detect the start of a patch # (this heuristic is borrowed from quilt) diffre = re.compile(br'^(?:Index:[ \t]|diff[ \t]-|RCS file: |' @@ -215,85 +230,80 @@ re.MULTILINE | re.DOTALL) data = {} - fd, tmpname = tempfile.mkstemp(prefix='hg-patch-') - tmpfp = os.fdopen(fd, pycompat.sysstr('w')) - try: - msg = emailparser.Parser().parse(fileobj) + + msg = pycompat.emailparser().parse(fileobj) - subject = msg['Subject'] and mail.headdecode(msg['Subject']) - data['user'] = msg['From'] and mail.headdecode(msg['From']) - if not subject and not data['user']: - # Not an email, restore parsed headers if any - subject = '\n'.join(': '.join(h) for h in msg.items()) + '\n' + subject = msg[r'Subject'] and mail.headdecode(msg[r'Subject']) + data['user'] = msg[r'From'] and mail.headdecode(msg[r'From']) + if not subject and not data['user']: + # Not an email, restore parsed headers if any + subject = '\n'.join(': '.join(map(encoding.strtolocal, h)) + for h in msg.items()) + '\n' - # should try to parse msg['Date'] - parents = [] + # should try to parse msg['Date'] + parents = [] - if subject: - if subject.startswith('[PATCH'): - pend = subject.find(']') - if pend >= 0: - subject = subject[pend + 1:].lstrip() - subject = re.sub(br'\n[ \t]+', ' ', subject) - ui.debug('Subject: %s\n' % subject) - if data['user']: - ui.debug('From: %s\n' % data['user']) - diffs_seen = 0 - ok_types = ('text/plain', 'text/x-diff', 'text/x-patch') - message = '' - for part in msg.walk(): - content_type = part.get_content_type() - ui.debug('Content-Type: %s\n' % content_type) - if content_type not in ok_types: - continue - payload = part.get_payload(decode=True) - m = diffre.search(payload) - if m: - hgpatch = False - 
hgpatchheader = False - ignoretext = False + if subject: + if subject.startswith('[PATCH'): + pend = subject.find(']') + if pend >= 0: + subject = subject[pend + 1:].lstrip() + subject = re.sub(br'\n[ \t]+', ' ', subject) + ui.debug('Subject: %s\n' % subject) + if data['user']: + ui.debug('From: %s\n' % data['user']) + diffs_seen = 0 + ok_types = ('text/plain', 'text/x-diff', 'text/x-patch') + message = '' + for part in msg.walk(): + content_type = pycompat.bytestr(part.get_content_type()) + ui.debug('Content-Type: %s\n' % content_type) + if content_type not in ok_types: + continue + payload = part.get_payload(decode=True) + m = diffre.search(payload) + if m: + hgpatch = False + hgpatchheader = False + ignoretext = False - ui.debug('found patch at byte %d\n' % m.start(0)) - diffs_seen += 1 - cfp = stringio() - for line in payload[:m.start(0)].splitlines(): - if line.startswith('# HG changeset patch') and not hgpatch: - ui.debug('patch generated by hg export\n') - hgpatch = True - hgpatchheader = True - # drop earlier commit message content - cfp.seek(0) - cfp.truncate() - subject = None - elif hgpatchheader: - if line.startswith('# User '): - data['user'] = line[7:] - ui.debug('From: %s\n' % data['user']) - elif line.startswith("# Parent "): - parents.append(line[9:].lstrip()) - elif line.startswith("# "): - for header, key in patchheadermap: - prefix = '# %s ' % header - if line.startswith(prefix): - data[key] = line[len(prefix):] - else: - hgpatchheader = False - elif line == '---': - ignoretext = True - if not hgpatchheader and not ignoretext: - cfp.write(line) - cfp.write('\n') - message = cfp.getvalue() - if tmpfp: - tmpfp.write(payload) - if not payload.endswith('\n'): - tmpfp.write('\n') - elif not diffs_seen and message and content_type == 'text/plain': - message += '\n' + payload - except: # re-raises - tmpfp.close() - os.unlink(tmpname) - raise + ui.debug('found patch at byte %d\n' % m.start(0)) + diffs_seen += 1 + cfp = stringio() + for line in 
payload[:m.start(0)].splitlines(): + if line.startswith('# HG changeset patch') and not hgpatch: + ui.debug('patch generated by hg export\n') + hgpatch = True + hgpatchheader = True + # drop earlier commit message content + cfp.seek(0) + cfp.truncate() + subject = None + elif hgpatchheader: + if line.startswith('# User '): + data['user'] = line[7:] + ui.debug('From: %s\n' % data['user']) + elif line.startswith("# Parent "): + parents.append(line[9:].lstrip()) + elif line.startswith("# "): + for header, key in patchheadermap: + prefix = '# %s ' % header + if line.startswith(prefix): + data[key] = line[len(prefix):] + else: + hgpatchheader = False + elif line == '---': + ignoretext = True + if not hgpatchheader and not ignoretext: + cfp.write(line) + cfp.write('\n') + message = cfp.getvalue() + if tmpfp: + tmpfp.write(payload) + if not payload.endswith('\n'): + tmpfp.write('\n') + elif not diffs_seen and message and content_type == 'text/plain': + message += '\n' + payload if subject and not message.startswith(subject): message = '%s\n%s' % (subject, message) @@ -306,8 +316,7 @@ if diffs_seen: data['filename'] = tmpname - else: - os.unlink(tmpname) + return data class patchmeta(object): @@ -567,7 +576,7 @@ root = tempfile.mkdtemp(prefix='hg-patch-') self.opener = vfsmod.vfs(root) # Avoid filename issues with these simple names - fn = str(self.created) + fn = '%d' % self.created self.opener.write(fn, data) self.created += 1 self.files[fname] = (fn, mode, copied) @@ -791,8 +800,7 @@ # if there's skew we want to emit the "(offset %d lines)" even # when the hunk cleanly applies at start + skew, so skip the # fast case code - if (self.skew == 0 and - diffhelpers.testhunk(old, self.lines, oldstart) == 0): + if self.skew == 0 and diffhelpers.testhunk(old, self.lines, oldstart): if self.remove: self.backend.unlink(self.fname) else: @@ -819,7 +827,7 @@ cand = [oldstart] for l in cand: - if not old or diffhelpers.testhunk(old, self.lines, l) == 0: + if not old or 
diffhelpers.testhunk(old, self.lines, l): self.lines[l : l + len(old)] = new self.offset += len(new) - len(old) self.skew = l - orig_start @@ -1102,11 +1110,11 @@ the hunk is left unchanged. """) (patchfd, patchfn) = tempfile.mkstemp(prefix="hg-editor-", - suffix=".diff", text=True) + suffix=".diff") ncpatchfp = None try: # Write the initial patch - f = os.fdopen(patchfd, pycompat.sysstr("w")) + f = util.nativeeolwriter(os.fdopen(patchfd, r'wb')) chunk.header.write(f) chunk.write(f) f.write('\n'.join(['# ' + i for i in phelp.splitlines()])) @@ -1120,9 +1128,10 @@ ui.warn(_("editor exited with exit code %d\n") % ret) continue # Remove comment lines - patchfp = open(patchfn) + patchfp = open(patchfn, r'rb') ncpatchfp = stringio() for line in util.iterfile(patchfp): + line = util.fromnativeeol(line) if not line.startswith('#'): ncpatchfp.write(line) patchfp.close() @@ -1249,8 +1258,11 @@ self.lenb = int(self.lenb) self.starta = int(self.starta) self.startb = int(self.startb) - diffhelpers.addlines(lr, self.hunk, self.lena, self.lenb, self.a, - self.b) + try: + diffhelpers.addlines(lr, self.hunk, self.lena, self.lenb, + self.a, self.b) + except error.ParseError as e: + raise PatchError(_("bad hunk #%d: %s") % (self.number, e)) # if we hit eof before finishing out the hunk, the last line will # be zero length. Lets try to fix it up. 
while len(self.hunk[-1]) == 0: @@ -1367,7 +1379,7 @@ def _fixnewline(self, lr): l = lr.readline() if l.startswith('\ '): - diffhelpers.fix_newline(self.hunk, self.a, self.b) + diffhelpers.fixnewline(self.hunk, self.a, self.b) else: lr.push(l) @@ -1385,13 +1397,13 @@ hlen = len(self.hunk) for x in xrange(hlen - 1): # the hunk starts with the @@ line, so use x+1 - if self.hunk[x + 1][0] == ' ': + if self.hunk[x + 1].startswith(' '): top += 1 else: break if not toponly: for x in xrange(hlen - 1): - if self.hunk[hlen - bot - 1][0] == ' ': + if self.hunk[hlen - bot - 1].startswith(' '): bot += 1 else: break @@ -1451,7 +1463,7 @@ dec = [] line = getline(lr, self.hunk) while len(line) > 1: - l = line[0] + l = line[0:1] if l <= 'Z' and l >= 'A': l = ord(l) - ord('A') + 1 else: @@ -1460,7 +1472,7 @@ dec.append(util.b85decode(line[1:])[:l]) except ValueError as e: raise PatchError(_('could not decode "%s" binary patch: %s') - % (self._fname, str(e))) + % (self._fname, stringutil.forcebytestr(e))) line = getline(lr, self.hunk) text = zlib.decompress(''.join(dec)) if len(text) != size: @@ -1793,10 +1805,12 @@ else: lr.push(fromfile) yield 'file', header - elif line[0:1] == ' ': - yield 'context', scanwhile(line, lambda l: l[0] in ' \\') - elif line[0] in '-+': - yield 'hunk', scanwhile(line, lambda l: l[0] in '-+\\') + elif line.startswith(' '): + cs = (' ', '\\') + yield 'context', scanwhile(line, lambda l: l.startswith(cs)) + elif line.startswith(('-', '+')): + cs = ('-', '+', '\\') + yield 'hunk', scanwhile(line, lambda l: l.startswith(cs)) else: m = lines_re.match(line) if m: @@ -1852,7 +1866,7 @@ for x in iter(lr.readline, ''): if state == BFILE and ( - (not context and x[0] == '@') + (not context and x.startswith('@')) or (context is not False and x.startswith('***************')) or x.startswith('GIT binary patch')): gp = None @@ -2096,9 +2110,10 @@ args = [] cwd = repo.root if cwd: - args.append('-d %s' % util.shellquote(cwd)) - fp = util.popen('%s %s -p%d < %s' % 
(patcher, ' '.join(args), strip, - util.shellquote(patchname))) + args.append('-d %s' % procutil.shellquote(cwd)) + cmd = ('%s %s -p%d < %s' + % (patcher, ' '.join(args), strip, procutil.shellquote(patchname))) + fp = procutil.popen(cmd, 'rb') try: for line in util.iterfile(fp): line = line.rstrip() @@ -2126,7 +2141,7 @@ code = fp.close() if code: raise PatchError(_("patch command failed: %s") % - util.explainexit(code)[0]) + procutil.explainexit(code)) return fuzz def patchbackend(ui, backend, patchobj, strip, prefix, files=None, @@ -2256,6 +2271,7 @@ 'context': get('unified', getter=ui.config), } buildopts['worddiff'] = ui.configbool('experimental', 'worddiff') + buildopts['xdiff'] = ui.configbool('experimental', 'xdiff') if git: buildopts['git'] = get('git') @@ -2342,7 +2358,7 @@ if hunksfilterfn is not None: # If the file has been removed, fctx2 is None; but this should # not occur here since we catch removed files early in - # cmdutil.getloglinerangerevs() for 'hg log -L'. + # logcmdutil.getlinerangerevs() for 'hg log -L'. assert fctx2 is not None, \ 'fctx2 unexpectly None in diff hunks filtering' hunks = hunksfilterfn(fctx2, hunks) @@ -2450,6 +2466,10 @@ # reported as copies. We want to show them in the diff as additions. 
del copy[dst] + prefetchmatch = scmutil.matchfiles( + repo, list(modifiedset | addedset | removedset)) + scmutil.prefetchfiles(repo, [ctx1.rev(), ctx2.rev()], prefetchmatch) + def difffn(opts, losedata): return trydiff(repo, revs, ctx1, ctx2, modified, added, removed, copy, getfilectx, opts, losedata, prefix, relroot) @@ -2465,11 +2485,102 @@ else: return difffn(opts, None) +def diffsinglehunk(hunklines): + """yield tokens for a list of lines in a single hunk""" + for line in hunklines: + # chomp + chompline = line.rstrip('\n') + # highlight tabs and trailing whitespace + stripline = chompline.rstrip() + if line[0] == '-': + label = 'diff.deleted' + elif line[0] == '+': + label = 'diff.inserted' + else: + raise error.ProgrammingError('unexpected hunk line: %s' % line) + for token in tabsplitter.findall(stripline): + if '\t' == token[0]: + yield (token, 'diff.tab') + else: + yield (token, label) + + if chompline != stripline: + yield (chompline[len(stripline):], 'diff.trailingwhitespace') + if chompline != line: + yield (line[len(chompline):], '') + +def diffsinglehunkinline(hunklines): + """yield tokens for a list of lines in a single hunk, with inline colors""" + # prepare deleted, and inserted content + a = '' + b = '' + for line in hunklines: + if line[0] == '-': + a += line[1:] + elif line[0] == '+': + b += line[1:] + else: + raise error.ProgrammingError('unexpected hunk line: %s' % line) + # fast path: if either side is empty, use diffsinglehunk + if not a or not b: + for t in diffsinglehunk(hunklines): + yield t + return + # re-split the content into words + al = wordsplitter.findall(a) + bl = wordsplitter.findall(b) + # re-arrange the words to lines since the diff algorithm is line-based + aln = [s if s == '\n' else s + '\n' for s in al] + bln = [s if s == '\n' else s + '\n' for s in bl] + an = ''.join(aln) + bn = ''.join(bln) + # run the diff algorithm, prepare atokens and btokens + atokens = [] + btokens = [] + blocks = mdiff.allblocks(an, bn, lines1=aln, 
lines2=bln) + for (a1, a2, b1, b2), btype in blocks: + changed = btype == '!' + for token in mdiff.splitnewlines(''.join(al[a1:a2])): + atokens.append((changed, token)) + for token in mdiff.splitnewlines(''.join(bl[b1:b2])): + btokens.append((changed, token)) + + # yield deleted tokens, then inserted ones + for prefix, label, tokens in [('-', 'diff.deleted', atokens), + ('+', 'diff.inserted', btokens)]: + nextisnewline = True + for changed, token in tokens: + if nextisnewline: + yield (prefix, label) + nextisnewline = False + # special handling line end + isendofline = token.endswith('\n') + if isendofline: + chomp = token[:-1] # chomp + token = chomp.rstrip() # detect spaces at the end + endspaces = chomp[len(token):] + # scan tabs + for maybetab in tabsplitter.findall(token): + if '\t' == maybetab[0]: + currentlabel = 'diff.tab' + else: + if changed: + currentlabel = label + '.changed' + else: + currentlabel = label + '.unchanged' + yield (maybetab, currentlabel) + if isendofline: + if endspaces: + yield (endspaces, 'diff.trailingwhitespace') + yield ('\n', '') + nextisnewline = True + def difflabel(func, *args, **kw): '''yields 2-tuples of (output, label) based on the output of func()''' - inlinecolor = False - if kw.get(r'opts'): - inlinecolor = kw[r'opts'].worddiff + if kw.get(r'opts') and kw[r'opts'].worddiff: + dodiffhunk = diffsinglehunkinline + else: + dodiffhunk = diffsinglehunk headprefixes = [('diff', 'diff.diffline'), ('copy', 'diff.extended'), ('rename', 'diff.extended'), @@ -2481,124 +2592,59 @@ ('---', 'diff.file_a'), ('+++', 'diff.file_b')] textprefixes = [('@', 'diff.hunk'), - ('-', 'diff.deleted'), - ('+', 'diff.inserted')] + # - and + are handled by diffsinglehunk + ] head = False + + # buffers a hunk, i.e. adjacent "-", "+" lines without other changes. 
+ hunkbuffer = [] + def consumehunkbuffer(): + if hunkbuffer: + for token in dodiffhunk(hunkbuffer): + yield token + hunkbuffer[:] = [] + for chunk in func(*args, **kw): lines = chunk.split('\n') - matches = {} - if inlinecolor: - matches = _findmatches(lines) + linecount = len(lines) for i, line in enumerate(lines): - if i != 0: - yield ('\n', '') if head: if line.startswith('@'): head = False else: - if line and line[0] not in ' +-@\\': + if line and not line.startswith((' ', '+', '-', '@', '\\')): head = True - stripline = line diffline = False - if not head and line and line[0] in '+-': - # highlight tabs and trailing whitespace, but only in - # changed lines - stripline = line.rstrip() + if not head and line and line.startswith(('+', '-')): diffline = True prefixes = textprefixes if head: prefixes = headprefixes - for prefix, label in prefixes: - if stripline.startswith(prefix): - if diffline: - if i in matches: - for t, l in _inlinediff(lines[i].rstrip(), - lines[matches[i]].rstrip(), - label): - yield (t, l) - else: - for token in tabsplitter.findall(stripline): - if '\t' == token[0]: - yield (token, 'diff.tab') - else: - yield (token, label) - else: - yield (stripline, label) - break + if diffline: + # buffered + bufferedline = line + if i + 1 < linecount: + bufferedline += "\n" + hunkbuffer.append(bufferedline) else: - yield (line, '') - if line != stripline: - yield (line[len(stripline):], 'diff.trailingwhitespace') - -def _findmatches(slist): - '''Look for insertion matches to deletion and returns a dict of - correspondences. 
- ''' - lastmatch = 0 - matches = {} - for i, line in enumerate(slist): - if line == '': - continue - if line[0] == '-': - lastmatch = max(lastmatch, i) - newgroup = False - for j, newline in enumerate(slist[lastmatch + 1:]): - if newline == '': - continue - if newline[0] == '-' and newgroup: # too far, no match - break - if newline[0] == '+': # potential match - newgroup = True - sim = difflib.SequenceMatcher(None, line, newline).ratio() - if sim > 0.7: - lastmatch = lastmatch + 1 + j - matches[i] = lastmatch - matches[lastmatch] = i + # unbuffered + for token in consumehunkbuffer(): + yield token + stripline = line.rstrip() + for prefix, label in prefixes: + if stripline.startswith(prefix): + yield (stripline, label) + if line != stripline: + yield (line[len(stripline):], + 'diff.trailingwhitespace') break - return matches - -def _inlinediff(s1, s2, operation): - '''Perform string diff to highlight specific changes.''' - operation_skip = '+?' if operation == 'diff.deleted' else '-?' - if operation == 'diff.deleted': - s2, s1 = s1, s2 - - buff = [] - # we never want to higlight the leading +- - if operation == 'diff.deleted' and s2.startswith('-'): - label = operation - token = '-' - s2 = s2[1:] - s1 = s1[1:] - elif operation == 'diff.inserted' and s1.startswith('+'): - label = operation - token = '+' - s2 = s2[1:] - s1 = s1[1:] - else: - raise error.ProgrammingError("Case not expected, operation = %s" % - operation) - - s = difflib.ndiff(_nonwordre.split(s2), _nonwordre.split(s1)) - for part in s: - if part[0] in operation_skip or len(part) == 2: - continue - l = operation + '.highlight' - if part[0] in ' ': - l = operation - if part[2:] == '\t': - l = 'diff.tab' - if l == label: # contiguous token with same label - token += part[2:] - continue - else: - buff.append((token, label)) - label = l - token = part[2:] - buff.append((token, label)) - - return buff + else: + yield (line, '') + if i + 1 < linecount: + yield ('\n', '') + for token in consumehunkbuffer(): + 
yield token def diffui(*args, **kw): '''like diff(), but yields 2-tuples of (output, label) for ui.write()''' @@ -2670,8 +2716,8 @@ def isempty(fctx): return fctx is None or fctx.size() == 0 - date1 = util.datestr(ctx1.date()) - date2 = util.datestr(ctx2.date()) + date1 = dateutil.datestr(ctx1.date()) + date2 = dateutil.datestr(ctx2.date()) gitmode = {'l': '120000', 'x': '100755', '': '100644'} @@ -2698,8 +2744,10 @@ if opts.git or losedatafn: flag2 = ctx2.flags(f2) # if binary is True, output "summary" or "base85", but not "text diff" - binary = not opts.text and any(f.isbinary() - for f in [fctx1, fctx2] if f is not None) + if opts.text: + binary = False + else: + binary = any(f.isbinary() for f in [fctx1, fctx2] if f is not None) if losedatafn and not opts.git: if (binary or @@ -2789,7 +2837,8 @@ uheaders, hunks = mdiff.unidiff(content1, date1, content2, date2, - path1, path2, opts=opts) + path1, path2, + binary=binary, opts=opts) header.extend(uheaders) yield fctx1, fctx2, header, hunks diff -r fb92df8b634c -r ed5448edcbfa mercurial/pathutil.py --- a/mercurial/pathutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/pathutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -65,7 +65,7 @@ parts = util.splitpath(path) if (os.path.splitdrive(path)[0] or _lowerclean(parts[0]) in ('.hg', '.hg.', '') - or os.pardir in parts): + or pycompat.ospardir in parts): raise error.Abort(_("path contains illegal component: %s") % path) # Windows shortname aliases for p in parts: @@ -81,7 +81,7 @@ pos = lparts.index(p) base = os.path.join(*parts[:pos]) raise error.Abort(_("path '%s' is inside nested repo %r") - % (path, base)) + % (path, pycompat.bytestr(base))) normparts = util.splitpath(normpath) assert len(parts) == len(normparts) @@ -119,13 +119,14 @@ raise else: if stat.S_ISLNK(st.st_mode): - msg = _('path %r traverses symbolic link %r') % (path, prefix) + msg = (_('path %r traverses symbolic link %r') + % (pycompat.bytestr(path), pycompat.bytestr(prefix))) raise error.Abort(msg) 
elif (stat.S_ISDIR(st.st_mode) and os.path.isdir(os.path.join(curpath, '.hg'))): if not self.callback or not self.callback(curpath): msg = _("path '%s' is inside nested repo %r") - raise error.Abort(msg % (path, prefix)) + raise error.Abort(msg % (path, pycompat.bytestr(prefix))) def check(self, path): try: diff -r fb92df8b634c -r ed5448edcbfa mercurial/peer.py --- a/mercurial/peer.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -# peer.py - repository base classes for mercurial -# -# Copyright 2005, 2006 Matt Mackall -# Copyright 2006 Vadim Gelfer -# -# This software may be used and distributed according to the terms of the -# GNU General Public License version 2 or any later version. - -from __future__ import absolute_import - -from . import ( - error, - pycompat, - util, -) - -# abstract batching support - -class future(object): - '''placeholder for a value to be set later''' - def set(self, value): - if util.safehasattr(self, 'value'): - raise error.RepoError("future is already set") - self.value = value - -class batcher(object): - '''base class for batches of commands submittable in a single request - - All methods invoked on instances of this class are simply queued and - return a a future for the result. Once you call submit(), all the queued - calls are performed and the results set in their respective futures. - ''' - def __init__(self): - self.calls = [] - def __getattr__(self, name): - def call(*args, **opts): - resref = future() - # Please don't invent non-ascii method names, or you will - # give core hg a very sad time. 
- self.calls.append((name.encode('ascii'), args, opts, resref,)) - return resref - return call - def submit(self): - raise NotImplementedError() - -class iterbatcher(batcher): - - def submit(self): - raise NotImplementedError() - - def results(self): - raise NotImplementedError() - -class localiterbatcher(iterbatcher): - def __init__(self, local): - super(iterbatcher, self).__init__() - self.local = local - - def submit(self): - # submit for a local iter batcher is a noop - pass - - def results(self): - for name, args, opts, resref in self.calls: - resref.set(getattr(self.local, name)(*args, **opts)) - yield resref.value - -def batchable(f): - '''annotation for batchable methods - - Such methods must implement a coroutine as follows: - - @batchable - def sample(self, one, two=None): - # Build list of encoded arguments suitable for your wire protocol: - encargs = [('one', encode(one),), ('two', encode(two),)] - # Create future for injection of encoded result: - encresref = future() - # Return encoded arguments and future: - yield encargs, encresref - # Assuming the future to be filled with the result from the batched - # request now. Decode it: - yield decode(encresref.value) - - The decorator returns a function which wraps this coroutine as a plain - method, but adds the original method as an attribute called "batchable", - which is used by remotebatch to split the call into separate encoding and - decoding phases. 
- ''' - def plain(*args, **opts): - batchable = f(*args, **opts) - encargsorres, encresref = next(batchable) - if not encresref: - return encargsorres # a local result in this case - self = args[0] - cmd = pycompat.bytesurl(f.__name__) # ensure cmd is ascii bytestr - encresref.set(self._submitone(cmd, encargsorres)) - return next(batchable) - setattr(plain, 'batchable', f) - return plain diff -r fb92df8b634c -r ed5448edcbfa mercurial/phases.py --- a/mercurial/phases.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/phases.py Wed Apr 18 15:32:08 2018 -0400 @@ -262,7 +262,8 @@ repo = repo.unfiltered() nativeroots = [] for phase in trackedphases: - nativeroots.append(map(repo.changelog.rev, self.phaseroots[phase])) + nativeroots.append(pycompat.maplist(repo.changelog.rev, + self.phaseroots[phase])) return repo.changelog.computephases(nativeroots) def _computephaserevspure(self, repo): @@ -326,7 +327,7 @@ def _write(self, fp): for phase, roots in enumerate(self.phaseroots): - for h in roots: + for h in sorted(roots): fp.write('%i %s\n' % (phase, hex(h))) self.dirty = False diff -r fb92df8b634c -r ed5448edcbfa mercurial/policy.py --- a/mercurial/policy.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/policy.py Wed Apr 18 15:32:08 2018 -0400 @@ -44,11 +44,6 @@ if r'__pypy__' in sys.builtin_module_names: policy = b'cffi' -# Our C extensions aren't yet compatible with Python 3. So use pure Python -# on Python 3 for now. -if sys.version_info[0] >= 3: - policy = b'py' - # Environment variable can always force settings. 
if sys.version_info[0] >= 3: if r'HGMODULEPOLICY' in os.environ: @@ -71,10 +66,9 @@ # keep in sync with "version" in C modules _cextversions = { (r'cext', r'base85'): 1, - (r'cext', r'bdiff'): 1, - (r'cext', r'diffhelpers'): 1, + (r'cext', r'bdiff'): 3, (r'cext', r'mpatch'): 1, - (r'cext', r'osutil'): 3, + (r'cext', r'osutil'): 4, (r'cext', r'parsers'): 4, } @@ -83,7 +77,6 @@ (r'cext', r'charencode'): (r'cext', r'parsers'), (r'cffi', r'base85'): (r'pure', r'base85'), (r'cffi', r'charencode'): (r'pure', r'charencode'), - (r'cffi', r'diffhelpers'): (r'pure', r'diffhelpers'), (r'cffi', r'parsers'): (r'pure', r'parsers'), } diff -r fb92df8b634c -r ed5448edcbfa mercurial/posix.py --- a/mercurial/posix.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/posix.py Wed Apr 18 15:32:08 2018 -0400 @@ -113,7 +113,7 @@ if l: if not stat.S_ISLNK(s): # switch file to link - fp = open(f) + fp = open(f, 'rb') data = fp.read() fp.close() unlink(f) @@ -121,7 +121,7 @@ os.symlink(data, f) except OSError: # failed to make a link, rewrite file - fp = open(f, "w") + fp = open(f, "wb") fp.write(data) fp.close() # no chmod needed at this point @@ -130,7 +130,7 @@ # switch link to file data = os.readlink(f) unlink(f) - fp = open(f, "w") + fp = open(f, "wb") fp.write(data) fp.close() s = 0o666 & ~umask # avoid restatting for chmod @@ -264,7 +264,8 @@ # already exists. 
target = 'checklink-target' try: - open(os.path.join(cachedir, target), 'w').close() + fullpath = os.path.join(cachedir, target) + open(fullpath, 'w').close() except IOError as inst: if inst[0] == errno.EACCES: # If we can't write to cachedir, just pretend @@ -461,12 +462,13 @@ else: return "'%s'" % s.replace("'", "'\\''") +def shellsplit(s): + """Parse a command string in POSIX shell way (best-effort)""" + return pycompat.shlexsplit(s, posix=True) + def quotecommand(cmd): return cmd -def popen(command, mode='r'): - return os.popen(command, mode) - def testpid(pid): '''return False if pid dead, True if running or not sure''' if pycompat.sysplatform == 'OpenVMS': @@ -477,13 +479,6 @@ except OSError as inst: return inst.errno != errno.ESRCH -def explainexit(code): - """return a 2-tuple (desc, code) describing a subprocess status - (codes from kill are negative - not os.system/wait encoding)""" - if code >= 0: - return _("exited with status %d") % code, code - return _("killed by signal %d") % -code, -code - def isowner(st): """Return True if the stat object st is from the current user.""" return st.st_uid == os.getuid() @@ -613,17 +608,14 @@ self.stat.st_uid == other.stat.st_uid and self.stat.st_gid == other.stat.st_gid and self.stat.st_size == other.stat.st_size and - self.stat.st_mtime == other.stat.st_mtime and - self.stat.st_ctime == other.stat.st_ctime) + self.stat[stat.ST_MTIME] == other.stat[stat.ST_MTIME] and + self.stat[stat.ST_CTIME] == other.stat[stat.ST_CTIME]) except AttributeError: return False def __ne__(self, other): return not self == other -def executablepath(): - return None # available on Windows only - def statislink(st): '''check whether a stat result is a symlink''' return st and stat.S_ISLNK(st.st_mode) diff -r fb92df8b634c -r ed5448edcbfa mercurial/profiling.py --- a/mercurial/profiling.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/profiling.py Wed Apr 18 15:32:08 2018 -0400 @@ -14,6 +14,7 @@ encoding, error, extensions, + pycompat, util, 
) @@ -143,7 +144,7 @@ elif profformat == 'hotpath': # inconsistent config: profiling.showmin limit = ui.configwith(fraction, 'profiling', 'showmin', 0.05) - kwargs['limit'] = limit + kwargs[r'limit'] = limit statprof.display(fp, data=data, format=displayformat, **kwargs) @@ -200,6 +201,17 @@ elif self._output: path = self._ui.expandpath(self._output) self._fp = open(path, 'wb') + elif pycompat.iswindows: + # parse escape sequence by win32print() + class uifp(object): + def __init__(self, ui): + self._ui = ui + def write(self, data): + self._ui.write_err(data) + def flush(self): + self._ui.flush() + self._fpdoclose = False + self._fp = uifp(self._ui) else: self._fpdoclose = False self._fp = self._ui.ferr diff -r fb92df8b634c -r ed5448edcbfa mercurial/progress.py --- a/mercurial/progress.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/progress.py Wed Apr 18 15:32:08 2018 -0400 @@ -119,10 +119,9 @@ add = topic elif indicator == 'number': if total: - add = ('% ' + str(len(str(total))) + - 's/%s') % (pos, total) + add = b'%*d/%d' % (len(str(total)), pos, total) else: - add = str(pos) + add = b'%d' % pos elif indicator.startswith('item') and item: slice = 'end' if '-' in indicator: diff -r fb92df8b634c -r ed5448edcbfa mercurial/pure/base85.py --- a/mercurial/pure/base85.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/pure/base85.py Wed Apr 18 15:32:08 2018 -0400 @@ -9,8 +9,10 @@ import struct -_b85chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" \ - "abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~" +from .. 
import pycompat + +_b85chars = pycompat.bytestr("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef" + "ghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~") _b85chars2 = [(a + b) for a in _b85chars for b in _b85chars] _b85dec = {} @@ -51,6 +53,7 @@ out = [] for i in range(0, len(text), 5): chunk = text[i:i + 5] + chunk = pycompat.bytestr(chunk) acc = 0 for j, c in enumerate(chunk): try: diff -r fb92df8b634c -r ed5448edcbfa mercurial/pure/bdiff.py --- a/mercurial/pure/bdiff.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/pure/bdiff.py Wed Apr 18 15:32:08 2018 -0400 @@ -90,3 +90,13 @@ text = re.sub('[ \t\r]+', ' ', text) text = text.replace(' \n', '\n') return text + +def splitnewlines(text): + '''like str.splitlines, but only split on newlines.''' + lines = [l + '\n' for l in text.split('\n')] + if lines: + if lines[-1] == '\n': + lines.pop() + else: + lines[-1] = lines[-1][:-1] + return lines diff -r fb92df8b634c -r ed5448edcbfa mercurial/pure/diffhelpers.py --- a/mercurial/pure/diffhelpers.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -# diffhelpers.py - pure Python implementation of diffhelpers.c -# -# Copyright 2009 Matt Mackall and others -# -# This software may be used and distributed according to the terms of the -# GNU General Public License version 2 or any later version. - -from __future__ import absolute_import - -def addlines(fp, hunk, lena, lenb, a, b): - while True: - todoa = lena - len(a) - todob = lenb - len(b) - num = max(todoa, todob) - if num == 0: - break - for i in xrange(num): - s = fp.readline() - c = s[0] - if s == "\\ No newline at end of file\n": - fix_newline(hunk, a, b) - continue - if c == "\n": - # Some patches may be missing the control char - # on empty lines. Supply a leading space. 
- s = " \n" - hunk.append(s) - if c == "+": - b.append(s[1:]) - elif c == "-": - a.append(s) - else: - b.append(s[1:]) - a.append(s) - return 0 - -def fix_newline(hunk, a, b): - l = hunk[-1] - # tolerate CRLF in last line - if l.endswith('\r\n'): - hline = l[:-2] - else: - hline = l[:-1] - c = hline[0] - - if c in " +": - b[-1] = hline[1:] - if c in " -": - a[-1] = hline - hunk[-1] = hline - return 0 - - -def testhunk(a, b, bstart): - alen = len(a) - blen = len(b) - if alen > blen - bstart: - return -1 - for i in xrange(alen): - if a[i][1:] != b[i + bstart]: - return -1 - return 0 diff -r fb92df8b634c -r ed5448edcbfa mercurial/pure/mpatch.py --- a/mercurial/pure/mpatch.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/pure/mpatch.py Wed Apr 18 15:32:08 2018 -0400 @@ -10,7 +10,7 @@ import struct from .. import pycompat -stringio = pycompat.stringio +stringio = pycompat.bytesio class mpatchError(Exception): """error raised when a delta cannot be decoded diff -r fb92df8b634c -r ed5448edcbfa mercurial/pure/parsers.py --- a/mercurial/pure/parsers.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/pure/parsers.py Wed Apr 18 15:32:08 2018 -0400 @@ -12,7 +12,7 @@ from ..node import nullid from .. 
import pycompat -stringio = pycompat.stringio +stringio = pycompat.bytesio _pack = struct.pack diff -r fb92df8b634c -r ed5448edcbfa mercurial/pycompat.py --- a/mercurial/pycompat.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/pycompat.py Wed Apr 18 15:32:08 2018 -0400 @@ -11,6 +11,7 @@ from __future__ import absolute_import import getopt +import inspect import os import shlex import sys @@ -25,7 +26,13 @@ import Queue as _queue import SocketServer as socketserver import xmlrpclib + + from .thirdparty.concurrent import futures + + def future_set_exception_info(f, exc_info): + f.set_exception_info(*exc_info) else: + import concurrent.futures as futures import http.cookiejar as cookielib import http.client as httplib import pickle @@ -33,6 +40,9 @@ import socketserver import xmlrpc.client as xmlrpclib + def future_set_exception_info(f, exc_info): + f.set_exception(exc_info[0]) + empty = _queue.Empty queue = _queue.Queue @@ -47,9 +57,11 @@ fsencode = os.fsencode fsdecode = os.fsdecode + oscurdir = os.curdir.encode('ascii') oslinesep = os.linesep.encode('ascii') osname = os.name.encode('ascii') ospathsep = os.pathsep.encode('ascii') + ospardir = os.pardir.encode('ascii') ossep = os.sep.encode('ascii') osaltsep = os.altsep if osaltsep: @@ -61,10 +73,21 @@ sysexecutable = sys.executable if sysexecutable: sysexecutable = os.fsencode(sysexecutable) - stringio = io.BytesIO - maplist = lambda *args: list(map(*args)) - ziplist = lambda *args: list(zip(*args)) + bytesio = io.BytesIO + # TODO deprecate stringio name, as it is a lie on Python 3. + stringio = bytesio + + def maplist(*args): + return list(map(*args)) + + def rangelist(*args): + return list(range(*args)) + + def ziplist(*args): + return list(zip(*args)) + rawinput = input + getargspec = inspect.getfullargspec # TODO: .buffer might not exist if std streams were replaced; we'll need # a silly wrapper to make a bytes stream backed by a unicode one. 
@@ -83,12 +106,13 @@ sysargv = list(map(os.fsencode, sys.argv)) bytechr = struct.Struct('>B').pack + byterepr = b'%r'.__mod__ class bytestr(bytes): """A bytes which mostly acts as a Python 2 str >>> bytestr(), bytestr(bytearray(b'foo')), bytestr(u'ascii'), bytestr(1) - (b'', b'foo', b'ascii', b'1') + ('', 'foo', 'ascii', '1') >>> s = bytestr(b'foo') >>> assert s is bytestr(s) @@ -98,7 +122,7 @@ ... def __bytes__(self): ... return b'bytes' >>> bytestr(bytesable()) - b'bytes' + 'bytes' There's no implicit conversion from non-ascii str as its encoding is unknown: @@ -154,10 +178,19 @@ def __iter__(self): return iterbytestr(bytes.__iter__(self)) + def __repr__(self): + return bytes.__repr__(self)[1:] # drop b'' + def iterbytestr(s): """Iterate bytes as if it were a str object of Python 2""" return map(bytechr, s) + def maybebytestr(s): + """Promote bytes to bytestr""" + if isinstance(s, bytes): + return bytestr(s) + return s + def sysbytes(s): """Convert an internal str (e.g. keyword, __doc__) back to bytes @@ -180,11 +213,15 @@ def strurl(url): """Converts a bytes url back to str""" - return url.decode(u'ascii') + if isinstance(url, bytes): + return url.decode(u'ascii') + return url def bytesurl(url): """Converts a str url to bytes by encoding in ascii""" - return url.encode(u'ascii') + if isinstance(url, str): + return url.encode(u'ascii') + return url def raisewithtb(exc, tb): """Raise exception with the given traceback""" @@ -212,8 +249,10 @@ xrange = builtins.range unicode = str - def open(name, mode='r', buffering=-1): - return builtins.open(name, sysstr(mode), buffering) + def open(name, mode='r', buffering=-1, encoding=None): + return builtins.open(name, sysstr(mode), buffering, encoding) + + safehasattr = _wrapattrfunc(builtins.hasattr) def _getoptbwrapper(orig, args, shortlist, namelist): """ @@ -249,21 +288,27 @@ return dic # TODO: handle shlex.shlex(). 
- def shlexsplit(s): + def shlexsplit(s, comments=False, posix=True): """ Takes bytes argument, convert it to str i.e. unicodes, pass that into shlex.split(), convert the returned value to bytes and return that for Python 3 compatibility as shelx.split() don't accept bytes on Python 3. """ - ret = shlex.split(s.decode('latin-1')) + ret = shlex.split(s.decode('latin-1'), comments, posix) return [a.encode('latin-1') for a in ret] + def emailparser(*args, **kwargs): + import email.parser + return email.parser.BytesParser(*args, **kwargs) + else: import cStringIO bytechr = chr + byterepr = repr bytestr = str iterbytestr = iter + maybebytestr = identity sysbytes = identity sysstr = identity strurl = identity @@ -292,15 +337,22 @@ def getdoc(obj): return getattr(obj, '__doc__', None) + _notset = object() + + def safehasattr(thing, attr): + return getattr(thing, attr, _notset) is not _notset + def _getoptbwrapper(orig, args, shortlist, namelist): return orig(args, shortlist, namelist) strkwargs = identity byteskwargs = identity + oscurdir = os.curdir oslinesep = os.linesep osname = os.name ospathsep = os.pathsep + ospardir = os.pardir ossep = os.sep osaltsep = os.altsep stdin = sys.stdin @@ -312,10 +364,17 @@ getcwd = os.getcwd sysexecutable = sys.executable shlexsplit = shlex.split - stringio = cStringIO.StringIO + bytesio = cStringIO.StringIO + stringio = bytesio maplist = map + rangelist = range ziplist = zip rawinput = raw_input + getargspec = inspect.getargspec + + def emailparser(*args, **kwargs): + import email.parser + return email.parser.Parser(*args, **kwargs) isjython = sysplatform.startswith('java') diff -r fb92df8b634c -r ed5448edcbfa mercurial/registrar.py --- a/mercurial/registrar.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/registrar.py Wed Apr 18 15:32:08 2018 -0400 @@ -138,15 +138,18 @@ potential repository locations. See ``findrepo()``. If a repository is found, it will be used and passed to the decorated function. 
- There are three constants in the class which tells what type of the command - that is. That information will be helpful at various places. It will be also - be used to decide what level of access the command has on hidden commits. - The constants are: + The `intents` argument defines a set of intended actions or capabilities + the command is taking. These intents can be used to affect the construction + of the repository object passed to the command. For example, commands + declaring that they are read-only could receive a repository that doesn't + have any methods allowing repository mutation. Other intents could be used + to prevent the command from running if the requested intent could not be + fulfilled. - `unrecoverablewrite` is for those write commands which can't be recovered - like push. - `recoverablewrite` is for write commands which can be recovered like commit. - `readonly` is for commands which are read only. + The following intents are defined: + + readonly + The command is read-only The signature of the decorated function looks like this: def cmd(ui[, repo] [, ] [, ]) @@ -161,29 +164,22 @@ descriptions and examples. 
""" - unrecoverablewrite = "unrecoverable" - recoverablewrite = "recoverable" - readonly = "readonly" - - possiblecmdtypes = {unrecoverablewrite, recoverablewrite, readonly} - def _doregister(self, func, name, options=(), synopsis=None, norepo=False, optionalrepo=False, inferrepo=False, - cmdtype=unrecoverablewrite): + intents=None): - if cmdtype not in self.possiblecmdtypes: - raise error.ProgrammingError("unknown cmdtype value '%s' for " - "'%s' command" % (cmdtype, name)) func.norepo = norepo func.optionalrepo = optionalrepo func.inferrepo = inferrepo - func.cmdtype = cmdtype + func.intents = intents or set() if synopsis: self._table[name] = func, list(options), synopsis else: self._table[name] = func, list(options) return func +INTENT_READONLY = b'readonly' + class revsetpredicate(_funcregistrarbase): """Decorator to register revset predicate @@ -283,6 +279,14 @@ templatekeyword = registrar.templatekeyword() + # new API (since Mercurial 4.6) + @templatekeyword('mykeyword', requires={'repo', 'ctx'}) + def mykeywordfunc(context, mapping): + '''Explanation of this template keyword .... + ''' + pass + + # old API @templatekeyword('mykeyword') def mykeywordfunc(repo, ctx, templ, cache, revcache, **args): '''Explanation of this template keyword .... @@ -291,6 +295,11 @@ The first string argument is used also in online help. + Optional argument 'requires' should be a collection of resource names + which the template keyword depends on. This also serves as a flag to + switch to the new API. If 'requires' is unspecified, all template + keywords and resources are expanded to the function arguments. + 'templatekeyword' instance in example above can be used to decorate multiple functions. @@ -301,6 +310,9 @@ Otherwise, explicit 'templatekw.loadkeyword()' is needed. 
""" + def _extrasetup(self, name, func, requires=None): + func._requires = requires + class templatefilter(_templateregistrarbase): """Decorator to register template filer @@ -308,7 +320,7 @@ templatefilter = registrar.templatefilter() - @templatefilter('myfilter') + @templatefilter('myfilter', intype=bytes) def myfilterfunc(text): '''Explanation of this template filter .... ''' @@ -316,6 +328,9 @@ The first string argument is used also in online help. + Optional argument 'intype' defines the type of the input argument, + which should be (bytes, int, templateutil.date, or None for any.) + 'templatefilter' instance in example above can be used to decorate multiple functions. @@ -326,6 +341,9 @@ Otherwise, explicit 'templatefilters.loadkeyword()' is needed. """ + def _extrasetup(self, name, func, intype=None): + func._intype = intype + class templatefunc(_templateregistrarbase): """Decorator to register template function @@ -352,7 +370,7 @@ extension, if an instance named as 'templatefunc' is used for decorating in extension. - Otherwise, explicit 'templater.loadfunction()' is needed. + Otherwise, explicit 'templatefuncs.loadfunction()' is needed. 
""" _getname = _funcregistrarbase._parsefuncdecl diff -r fb92df8b634c -r ed5448edcbfa mercurial/repair.py --- a/mercurial/repair.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/repair.py Wed Apr 18 15:32:08 2018 -0400 @@ -26,8 +26,12 @@ obsutil, util, ) +from .utils import ( + stringutil, +) -def _bundle(repo, bases, heads, node, suffix, compress=True, obsolescence=True): +def backupbundle(repo, bases, heads, node, suffix, compress=True, + obsolescence=True): """create a bundle with the specified revisions as a backup""" backupdir = "strip-backup" @@ -166,7 +170,7 @@ vfs = repo.vfs node = nodelist[-1] if backup: - backupfile = _bundle(repo, stripbases, cl.heads(), node, topic) + backupfile = backupbundle(repo, stripbases, cl.heads(), node, topic) repo.ui.status(_("saved backup bundle to %s\n") % vfs.join(backupfile)) repo.ui.log("backupbundle", "saved backup bundle to %s\n", @@ -179,8 +183,8 @@ # we are trying to strip. This is harmless since the stripped markers # are already backed up and we did not touched the markers for the # saved changesets. - tmpbundlefile = _bundle(repo, savebases, saveheads, node, 'temp', - compress=False, obsolescence=False) + tmpbundlefile = backupbundle(repo, savebases, saveheads, node, 'temp', + compress=False, obsolescence=False) try: with repo.transaction("strip") as tr: @@ -235,7 +239,8 @@ except OSError as e: if e.errno != errno.ENOENT: ui.warn(_('error removing %s: %s\n') % - (undovfs.join(undofile), str(e))) + (undovfs.join(undofile), + stringutil.forcebytestr(e))) except: # re-raises if backupfile: diff -r fb92df8b634c -r ed5448edcbfa mercurial/repository.py --- a/mercurial/repository.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/repository.py Wed Apr 18 15:32:08 2018 -0400 @@ -7,14 +7,15 @@ from __future__ import absolute_import -import abc - from .i18n import _ +from .thirdparty.zope import ( + interface as zi, +) from . 
import ( error, ) -class _basepeer(object): +class ipeerconnection(zi.Interface): """Represents a "connection" to a repository. This is the base interface for representing a connection to a repository. @@ -23,14 +24,9 @@ This is not a complete interface definition and should not be used outside of this module. """ - __metaclass__ = abc.ABCMeta + ui = zi.Attribute("""ui.ui instance""") - @abc.abstractproperty - def ui(self): - """ui.ui instance.""" - - @abc.abstractmethod - def url(self): + def url(): """Returns a URL string representing this peer. Currently, implementations expose the raw URL used to construct the @@ -42,62 +38,79 @@ value. """ - @abc.abstractmethod - def local(self): + def local(): """Returns a local repository instance. If the peer represents a local repository, returns an object that can be used to interface with it. Otherwise returns ``None``. """ - @abc.abstractmethod - def peer(self): + def peer(): """Returns an object conforming to this interface. Most implementations will ``return self``. """ - @abc.abstractmethod - def canpush(self): + def canpush(): """Returns a boolean indicating if this peer can be pushed to.""" - @abc.abstractmethod - def close(self): + def close(): """Close the connection to this peer. This is called when the peer will no longer be used. Resources associated with the peer should be cleaned up. """ -class _basewirecommands(object): +class ipeercapabilities(zi.Interface): + """Peer sub-interface related to capabilities.""" + + def capable(name): + """Determine support for a named capability. + + Returns ``False`` if capability not supported. + + Returns ``True`` if boolean capability is supported. Returns a string + if capability support is non-boolean. + + Capability strings may or may not map to wire protocol capabilities. + """ + + def requirecap(name, purpose): + """Require a capability to be present. + + Raises a ``CapabilityError`` if the capability isn't present. 
+ """ + +class ipeercommands(zi.Interface): """Client-side interface for communicating over the wire protocol. This interface is used as a gateway to the Mercurial wire protocol. methods commonly call wire protocol commands of the same name. """ - __metaclass__ = abc.ABCMeta - @abc.abstractmethod - def branchmap(self): + def branchmap(): """Obtain heads in named branches. Returns a dict mapping branch name to an iterable of nodes that are heads on that branch. """ - @abc.abstractmethod - def capabilities(self): + def capabilities(): """Obtain capabilities of the peer. Returns a set of string capabilities. """ - @abc.abstractmethod - def debugwireargs(self, one, two, three=None, four=None, five=None): + def clonebundles(): + """Obtains the clone bundles manifest for the repo. + + Returns the manifest as unparsed bytes. + """ + + def debugwireargs(one, two, three=None, four=None, five=None): """Used to facilitate debugging of arguments passed over the wire.""" - @abc.abstractmethod - def getbundle(self, source, **kwargs): + def getbundle(source, **kwargs): """Obtain remote repository data as a bundle. This command is how the bulk of repository data is transferred from @@ -106,15 +119,13 @@ Returns a generator of bundle data. """ - @abc.abstractmethod - def heads(self): + def heads(): """Determine all known head revisions in the peer. Returns an iterable of binary nodes. """ - @abc.abstractmethod - def known(self, nodes): + def known(nodes): """Determine whether multiple nodes are known. Accepts an iterable of nodes whose presence to check for. @@ -123,22 +134,19 @@ at that index is known to the peer. """ - @abc.abstractmethod - def listkeys(self, namespace): + def listkeys(namespace): """Obtain all keys in a pushkey namespace. Returns an iterable of key names. """ - @abc.abstractmethod - def lookup(self, key): + def lookup(key): """Resolve a value to a known revision. Returns a binary node of the resolved revision on success. 
""" - @abc.abstractmethod - def pushkey(self, namespace, key, old, new): + def pushkey(namespace, key, old, new): """Set a value using the ``pushkey`` protocol. Arguments correspond to the pushkey namespace and key to operate on and @@ -148,15 +156,13 @@ namespace. """ - @abc.abstractmethod - def stream_out(self): + def stream_out(): """Obtain streaming clone data. Successful result should be a generator of data chunks. """ - @abc.abstractmethod - def unbundle(self, bundle, heads, url): + def unbundle(bundle, heads, url): """Transfer repository data to the peer. This is how the bulk of data during a push is transferred. @@ -164,17 +170,15 @@ Returns the integer number of heads added to the peer. """ -class _baselegacywirecommands(object): +class ipeerlegacycommands(zi.Interface): """Interface for implementing support for legacy wire protocol commands. Wire protocol commands transition to legacy status when they are no longer used by modern clients. To facilitate identifying which commands are legacy, the interfaces are split. """ - __metaclass__ = abc.ABCMeta - @abc.abstractmethod - def between(self, pairs): + def between(pairs): """Obtain nodes between pairs of nodes. ``pairs`` is an iterable of node pairs. @@ -183,8 +187,7 @@ requested pair. """ - @abc.abstractmethod - def branches(self, nodes): + def branches(nodes): """Obtain ancestor changesets of specific nodes back to a branch point. For each requested node, the peer finds the first ancestor node that is @@ -193,54 +196,105 @@ Returns an iterable of iterables with the resolved values for each node. """ - @abc.abstractmethod - def changegroup(self, nodes, kind): + def changegroup(nodes, source): """Obtain a changegroup with data for descendants of specified nodes.""" - @abc.abstractmethod - def changegroupsubset(self, bases, heads, kind): + def changegroupsubset(bases, heads, source): pass -class peer(_basepeer, _basewirecommands): - """Unified interface and base class for peer repositories. 
+class ipeercommandexecutor(zi.Interface): + """Represents a mechanism to execute remote commands. - All peer instances must inherit from this class and conform to its - interface. + This is the primary interface for requesting that wire protocol commands + be executed. Instances of this interface are active in a context manager + and have a well-defined lifetime. When the context manager exits, all + outstanding requests are waited on. """ - @abc.abstractmethod - def iterbatch(self): - """Obtain an object to be used for multiple method calls. + def callcommand(name, args): + """Request that a named command be executed. + + Receives the command name and a dictionary of command arguments. - Various operations call several methods on peer instances. If each - method call were performed immediately and serially, this would - require round trips to remote peers and/or would slow down execution. + Returns a ``concurrent.futures.Future`` that will resolve to the + result of that command request. That exact value is left up to + the implementation and possibly varies by command. - Some peers have the ability to "batch" method calls to avoid costly - round trips or to facilitate concurrent execution. + Not all commands can coexist with other commands in an executor + instance: it depends on the underlying wire protocol transport being + used and the command itself. - This method returns an object that can be used to indicate intent to - perform batched method calls. + Implementations MAY call ``sendcommands()`` automatically if the + requested command can not coexist with other commands in this executor. + + Implementations MAY call ``sendcommands()`` automatically when the + future's ``result()`` is called. So, consumers using multiple + commands with an executor MUST ensure that ``result()`` is not called + until all command requests have been issued. + """ - The returned object is a proxy of this peer. 
It intercepts calls to - batchable methods and queues them instead of performing them - immediately. This proxy object has a ``submit`` method that will - perform all queued batchable method calls. A ``results()`` method - exposes the results of queued/batched method calls. It is a generator - of results in the order they were called. + def sendcommands(): + """Trigger submission of queued command requests. - Not all peers or wire protocol implementations may actually batch method - calls. However, they must all support this API. + Not all transports submit commands as soon as they are requested to + run. When called, this method forces queued command requests to be + issued. It will no-op if all commands have already been sent. + + When called, no more new commands may be issued with this executor. """ - def capable(self, name): - """Determine support for a named capability. + def close(): + """Signal that this command request is finished. + + When called, no more new commands may be issued. All outstanding + commands that have previously been issued are waited on before + returning. This not only includes waiting for the futures to resolve, + but also waiting for all response data to arrive. In other words, + calling this waits for all on-wire state for issued command requests + to finish. + + When used as a context manager, this method is called when exiting the + context manager. + + This method may call ``sendcommands()`` if there are buffered commands. + """ + +class ipeerrequests(zi.Interface): + """Interface for executing commands on a peer.""" + + def commandexecutor(): + """A context manager that resolves to an ipeercommandexecutor. + + The object this resolves to can be used to issue command requests + to the peer. - Returns ``False`` if capability not supported. + Callers should call its ``callcommand`` method to issue command + requests. - Returns ``True`` if boolean capability is supported. Returns a string - if capability support is non-boolean. 
+ A new executor should be obtained for each distinct set of commands + (possibly just a single command) that the consumer wants to execute + as part of a single operation or round trip. This is because some + peers are half-duplex and/or don't support persistent connections. + e.g. in the case of HTTP peers, commands sent to an executor represent + a single HTTP request. While some peers may support multiple command + sends over the wire per executor, consumers need to code to the least + capable peer. So it should be assumed that command executors buffer + called commands until they are told to send them and that each + command executor could result in a new connection or wire-level request + being issued. """ + +class ipeerbase(ipeerconnection, ipeercapabilities, ipeerrequests): + """Unified interface for peer repositories. + + All peer instances must conform to this interface. + """ + +@zi.implementer(ipeerbase) +class peer(object): + """Base class for peer repositories.""" + + def capable(self, name): caps = self.capabilities() if name in caps: return True @@ -253,10 +307,6 @@ return False def requirecap(self, name, purpose): - """Require a capability to be present. - - Raises a ``CapabilityError`` if the capability isn't present. - """ if self.capable(name): return @@ -264,5 +314,681 @@ _('cannot %s; remote repository does not support the %r ' 'capability') % (purpose, name)) -class legacypeer(peer, _baselegacywirecommands): - """peer but with support for legacy wire protocol commands.""" +class ifilerevisionssequence(zi.Interface): + """Contains index data for all revisions of a file. + + Types implementing this behave like lists of tuples. The index + in the list corresponds to the revision number. The values contain + index metadata. + + The *null* revision (revision number -1) is always the last item + in the index. 
+ """ + + def __len__(): + """The total number of revisions.""" + + def __getitem__(rev): + """Returns the object having a specific revision number. + + Returns an 8-tuple with the following fields: + + offset+flags + Contains the offset and flags for the revision. 64-bit unsigned + integer where first 6 bytes are the offset and the next 2 bytes + are flags. The offset can be 0 if it is not used by the store. + compressed size + Size of the revision data in the store. It can be 0 if it isn't + needed by the store. + uncompressed size + Fulltext size. It can be 0 if it isn't needed by the store. + base revision + Revision number of revision the delta for storage is encoded + against. -1 indicates not encoded against a base revision. + link revision + Revision number of changelog revision this entry is related to. + p1 revision + Revision number of 1st parent. -1 if no 1st parent. + p2 revision + Revision number of 2nd parent. -1 if no 2nd parent. + node + Binary node value for this revision number. + + Negative values should index off the end of the sequence. ``-1`` + should return the null revision. ``-2`` should return the most + recent revision. + """ + + def __contains__(rev): + """Whether a revision number exists.""" + + def insert(self, i, entry): + """Add an item to the index at specific revision.""" + +class ifileindex(zi.Interface): + """Storage interface for index data of a single file. + + File storage data is divided into index metadata and data storage. + This interface defines the index portion of the interface. + + The index logically consists of: + + * A mapping between revision numbers and nodes. + * DAG data (storing and querying the relationship between nodes). + * Metadata to facilitate storage. 
+ """ + index = zi.Attribute( + """An ``ifilerevisionssequence`` instance.""") + + def __len__(): + """Obtain the number of revisions stored for this file.""" + + def __iter__(): + """Iterate over revision numbers for this file.""" + + def revs(start=0, stop=None): + """Iterate over revision numbers for this file, with control.""" + + def parents(node): + """Returns a 2-tuple of parent nodes for a revision. + + Values will be ``nullid`` if the parent is empty. + """ + + def parentrevs(rev): + """Like parents() but operates on revision numbers.""" + + def rev(node): + """Obtain the revision number given a node. + + Raises ``error.LookupError`` if the node is not known. + """ + + def node(rev): + """Obtain the node value given a revision number. + + Raises ``IndexError`` if the node is not known. + """ + + def lookup(node): + """Attempt to resolve a value to a node. + + Value can be a binary node, hex node, revision number, or a string + that can be converted to an integer. + + Raises ``error.LookupError`` if a node could not be resolved. + """ + + def linkrev(rev): + """Obtain the changeset revision number a revision is linked to.""" + + def flags(rev): + """Obtain flags used to affect storage of a revision.""" + + def iscensored(rev): + """Return whether a revision's content has been censored.""" + + def commonancestorsheads(node1, node2): + """Obtain an iterable of nodes containing heads of common ancestors. + + See ``ancestor.commonancestorsheads()``. + """ + + def descendants(revs): + """Obtain descendant revision numbers for a set of revision numbers. + + If ``nullrev`` is in the set, this is equivalent to ``revs()``. + """ + + def headrevs(): + """Obtain a list of revision numbers that are DAG heads. + + The list is sorted oldest to newest. + + TODO determine if sorting is required. + """ + + def heads(start=None, stop=None): + """Obtain a list of nodes that are DAG heads, with control. 
+ + The set of revisions examined can be limited by specifying + ``start`` and ``stop``. ``start`` is a node. ``stop`` is an + iterable of nodes. DAG traversal starts at earlier revision + ``start`` and iterates forward until any node in ``stop`` is + encountered. + """ + + def children(node): + """Obtain nodes that are children of a node. + + Returns a list of nodes. + """ + + def deltaparent(rev): + """Return the revision that is a suitable parent to delta against.""" + + def candelta(baserev, rev): + """Whether a delta can be generated between two revisions.""" + +class ifiledata(zi.Interface): + """Storage interface for data storage of a specific file. + + This complements ``ifileindex`` and provides an interface for accessing + data for a tracked file. + """ + def rawsize(rev): + """The size of the fulltext data for a revision as stored.""" + + def size(rev): + """Obtain the fulltext size of file data. + + Any metadata is excluded from size measurements. Use ``rawsize()`` if + metadata size is important. + """ + + def checkhash(fulltext, node, p1=None, p2=None, rev=None): + """Validate the stored hash of a given fulltext and node. + + Raises ``error.RevlogError`` if hash validation fails. + """ + + def revision(node, raw=False): + """Obtain fulltext data for a node. + + By default, any storage transformations are applied before the data + is returned. If ``raw`` is True, non-raw storage transformations + are not applied. + + The fulltext data may contain a header containing metadata. Most + consumers should use ``read()`` to obtain the actual file data. + """ + + def read(node): + """Resolve file fulltext data. + + This is similar to ``revision()`` except any metadata in the data + headers is stripped. + """ + + def renamed(node): + """Obtain copy metadata for a node. + + Returns ``False`` if no copy metadata is stored or a 2-tuple of + (path, node) from which this revision was copied. 
+ """ + + def cmp(node, fulltext): + """Compare fulltext to another revision. + + Returns True if the fulltext is different from what is stored. + + This takes copy metadata into account. + + TODO better document the copy metadata and censoring logic. + """ + + def revdiff(rev1, rev2): + """Obtain a delta between two revision numbers. + + Operates on raw data in the store (``revision(node, raw=True)``). + + The returned data is the result of ``bdiff.bdiff`` on the raw + revision data. + """ + +class ifilemutation(zi.Interface): + """Storage interface for mutation events of a tracked file.""" + + def add(filedata, meta, transaction, linkrev, p1, p2): + """Add a new revision to the store. + + Takes file data, dictionary of metadata, a transaction, linkrev, + and parent nodes. + + Returns the node that was added. + + May no-op if a revision matching the supplied data is already stored. + """ + + def addrevision(revisiondata, transaction, linkrev, p1, p2, node=None, + flags=0, cachedelta=None): + """Add a new revision to the store. + + This is similar to ``add()`` except it operates at a lower level. + + The data passed in already contains a metadata header, if any. + + ``node`` and ``flags`` can be used to define the expected node and + the flags to use with storage. + + ``add()`` is usually called when adding files from e.g. the working + directory. ``addrevision()`` is often called by ``add()`` and for + scenarios where revision data has already been computed, such as when + applying raw data from a peer repo. + """ + + def addgroup(deltas, linkmapper, transaction, addrevisioncb=None): + """Process a series of deltas for storage. + + ``deltas`` is an iterable of 7-tuples of + (node, p1, p2, linknode, deltabase, delta, flags) defining revisions + to add. + + The ``delta`` field contains ``mpatch`` data to apply to a base + revision, identified by ``deltabase``. 
The base node can be + ``nullid``, in which case the header from the delta can be ignored + and the delta used as the fulltext. + + ``addrevisioncb`` should be called for each node as it is committed. + + Returns a list of nodes that were processed. A node will be in the list + even if it existed in the store previously. + """ + + def getstrippoint(minlink): + """Find the minimum revision that must be stripped to strip a linkrev. + + Returns a 2-tuple containing the minimum revision number and a set + of all revision numbers that would be broken by this strip. + + TODO this is highly revlog centric and should be abstracted into + a higher-level deletion API. ``repair.strip()`` relies on this. + """ + + def strip(minlink, transaction): + """Remove storage of items starting at a linkrev. + + This uses ``getstrippoint()`` to determine the first node to remove. + Then it effectively truncates storage for all revisions after that. + + TODO this is highly revlog centric and should be abstracted into a + higher-level deletion API. + """ + +class ifilestorage(ifileindex, ifiledata, ifilemutation): + """Complete storage interface for a single tracked file.""" + + version = zi.Attribute( + """Version number of storage. + + TODO this feels revlog centric and could likely be removed. + """) + + storedeltachains = zi.Attribute( + """Whether the store stores deltas. + + TODO deltachains are revlog centric. This can probably be removed + once there are better abstractions for obtaining/writing + data. + """) + + _generaldelta = zi.Attribute( + """Whether deltas can be against any parent revision. + + TODO this is used by changegroup code and it could probably be + folded into another API. + """) + + def files(): + """Obtain paths that are backing storage for this file. + + TODO this is used heavily by verify code and there should probably + be a better API for that. + """ + + def checksize(): + """Obtain the expected sizes of backing files. 
+ + TODO this is used by verify and it should not be part of the interface. + """ + +class completelocalrepository(zi.Interface): + """Monolithic interface for local repositories. + + This currently captures the reality of things - not how things should be. + """ + + supportedformats = zi.Attribute( + """Set of requirements that apply to stream clone. + + This is actually a class attribute and is shared among all instances. + """) + + openerreqs = zi.Attribute( + """Set of requirements that are passed to the opener. + + This is actually a class attribute and is shared among all instances. + """) + + supported = zi.Attribute( + """Set of requirements that this repo is capable of opening.""") + + requirements = zi.Attribute( + """Set of requirements this repo uses.""") + + filtername = zi.Attribute( + """Name of the repoview that is active on this repo.""") + + wvfs = zi.Attribute( + """VFS used to access the working directory.""") + + vfs = zi.Attribute( + """VFS rooted at the .hg directory. + + Used to access repository data not in the store. + """) + + svfs = zi.Attribute( + """VFS rooted at the store. + + Used to access repository data in the store. Typically .hg/store. + But can point elsewhere if the store is shared. + """) + + root = zi.Attribute( + """Path to the root of the working directory.""") + + path = zi.Attribute( + """Path to the .hg directory.""") + + origroot = zi.Attribute( + """The filesystem path that was used to construct the repo.""") + + auditor = zi.Attribute( + """A pathauditor for the working directory. + + This checks if a path refers to a nested repository. + + Operates on the filesystem. + """) + + nofsauditor = zi.Attribute( + """A pathauditor for the working directory. + + This is like ``auditor`` except it doesn't do filesystem checks. 
+ """) + + baseui = zi.Attribute( + """Original ui instance passed into constructor.""") + + ui = zi.Attribute( + """Main ui instance for this instance.""") + + sharedpath = zi.Attribute( + """Path to the .hg directory of the repo this repo was shared from.""") + + store = zi.Attribute( + """A store instance.""") + + spath = zi.Attribute( + """Path to the store.""") + + sjoin = zi.Attribute( + """Alias to self.store.join.""") + + cachevfs = zi.Attribute( + """A VFS used to access the cache directory. + + Typically .hg/cache. + """) + + filteredrevcache = zi.Attribute( + """Holds sets of revisions to be filtered.""") + + names = zi.Attribute( + """A ``namespaces`` instance.""") + + def close(): + """Close the handle on this repository.""" + + def peer(): + """Obtain an object conforming to the ``peer`` interface.""" + + def unfiltered(): + """Obtain an unfiltered/raw view of this repo.""" + + def filtered(name, visibilityexceptions=None): + """Obtain a named view of this repository.""" + + obsstore = zi.Attribute( + """A store of obsolescence data.""") + + changelog = zi.Attribute( + """A handle on the changelog revlog.""") + + manifestlog = zi.Attribute( + """A handle on the root manifest revlog.""") + + dirstate = zi.Attribute( + """Working directory state.""") + + narrowpats = zi.Attribute( + """Matcher patterns for this repository's narrowspec.""") + + def narrowmatch(): + """Obtain a matcher for the narrowspec.""" + + def setnarrowpats(newincludes, newexcludes): + """Define the narrowspec for this repository.""" + + def __getitem__(changeid): + """Try to resolve a changectx.""" + + def __contains__(changeid): + """Whether a changeset exists.""" + + def __nonzero__(): + """Always returns True.""" + return True + + __bool__ = __nonzero__ + + def __len__(): + """Returns the number of changesets in the repo.""" + + def __iter__(): + """Iterate over revisions in the changelog.""" + + def revs(expr, *args): + """Evaluate a revset. + + Emits revisions. 
+ """ + + def set(expr, *args): + """Evaluate a revset. + + Emits changectx instances. + """ + + def anyrevs(specs, user=False, localalias=None): + """Find revisions matching one of the given revsets.""" + + def url(): + """Returns a string representing the location of this repo.""" + + def hook(name, throw=False, **args): + """Call a hook.""" + + def tags(): + """Return a mapping of tag to node.""" + + def tagtype(tagname): + """Return the type of a given tag.""" + + def tagslist(): + """Return a list of tags ordered by revision.""" + + def nodetags(node): + """Return the tags associated with a node.""" + + def nodebookmarks(node): + """Return the list of bookmarks pointing to the specified node.""" + + def branchmap(): + """Return a mapping of branch to heads in that branch.""" + + def revbranchcache(): + pass + + def branchtip(branchtip, ignoremissing=False): + """Return the tip node for a given branch.""" + + def lookup(key): + """Resolve the node for a revision.""" + + def lookupbranch(key): + """Look up the branch name of the given revision or branch name.""" + + def known(nodes): + """Determine whether a series of nodes is known. + + Returns a list of bools. 
+ """ + + def local(): + """Whether the repository is local.""" + return True + + def publishing(): + """Whether the repository is a publishing repository.""" + + def cancopy(): + pass + + def shared(): + """The type of shared repository or None.""" + + def wjoin(f, *insidef): + """Calls self.vfs.reljoin(self.root, f, *insidef)""" + + def file(f): + """Obtain a filelog for a tracked path.""" + + def setparents(p1, p2): + """Set the parent nodes of the working directory.""" + + def filectx(path, changeid=None, fileid=None): + """Obtain a filectx for the given file revision.""" + + def getcwd(): + """Obtain the current working directory from the dirstate.""" + + def pathto(f, cwd=None): + """Obtain the relative path to a file.""" + + def adddatafilter(name, fltr): + pass + + def wread(filename): + """Read a file from wvfs, using data filters.""" + + def wwrite(filename, data, flags, backgroundclose=False, **kwargs): + """Write data to a file in the wvfs, using data filters.""" + + def wwritedata(filename, data): + """Resolve data for writing to the wvfs, using data filters.""" + + def currenttransaction(): + """Obtain the current transaction instance or None.""" + + def transaction(desc, report=None): + """Open a new transaction to write to the repository.""" + + def undofiles(): + """Returns a list of (vfs, path) for files to undo transactions.""" + + def recover(): + """Roll back an interrupted transaction.""" + + def rollback(dryrun=False, force=False): + """Undo the last transaction. + + DANGEROUS. 
+ """ + + def updatecaches(tr=None, full=False): + """Warm repo caches.""" + + def invalidatecaches(): + """Invalidate cached data due to the repository mutating.""" + + def invalidatevolatilesets(): + pass + + def invalidatedirstate(): + """Invalidate the dirstate.""" + + def invalidate(clearfilecache=False): + pass + + def invalidateall(): + pass + + def lock(wait=True): + """Lock the repository store and return a lock instance.""" + + def wlock(wait=True): + """Lock the non-store parts of the repository.""" + + def currentwlock(): + """Return the wlock if it's held or None.""" + + def checkcommitpatterns(wctx, vdirs, match, status, fail): + pass + + def commit(text='', user=None, date=None, match=None, force=False, + editor=False, extra=None): + """Add a new revision to the repository.""" + + def commitctx(ctx, error=False): + """Commit a commitctx instance to the repository.""" + + def destroying(): + """Inform the repository that nodes are about to be destroyed.""" + + def destroyed(): + """Inform the repository that nodes have been destroyed.""" + + def status(node1='.', node2=None, match=None, ignored=False, + clean=False, unknown=False, listsubrepos=False): + """Convenience method to call repo[x].status().""" + + def addpostdsstatus(ps): + pass + + def postdsstatus(): + pass + + def clearpostdsstatus(): + pass + + def heads(start=None): + """Obtain list of nodes that are DAG heads.""" + + def branchheads(branch=None, start=None, closed=False): + pass + + def branches(nodes): + pass + + def between(pairs): + pass + + def checkpush(pushop): + pass + + prepushoutgoinghooks = zi.Attribute( + """util.hooks instance.""") + + def pushkey(namespace, key, old, new): + pass + + def listkeys(namespace): + pass + + def debugwireargs(one, two, three=None, four=None, five=None): + pass + + def savecommitmessage(text): + pass diff -r fb92df8b634c -r ed5448edcbfa mercurial/revlog.py --- a/mercurial/revlog.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/revlog.py Wed Apr 
18 15:32:08 2018 -0400 @@ -13,12 +13,13 @@ from __future__ import absolute_import -import binascii import collections +import contextlib import errno import hashlib import heapq import os +import re import struct import zlib @@ -28,6 +29,7 @@ hex, nullid, nullrev, + wdirfilenodeids, wdirhex, wdirid, wdirrev, @@ -45,6 +47,9 @@ templatefilters, util, ) +from .utils import ( + stringutil, +) parsers = policy.importmod(r'parsers') @@ -94,6 +99,29 @@ REVIDX_ISCENSORED: None, } +_mdre = re.compile('\1\n') +def parsemeta(text): + """return (metadatadict, metadatasize)""" + # text can be buffer, so we can't use .startswith or .index + if text[:2] != '\1\n': + return None, None + s = _mdre.search(text, 2).start() + mtext = text[2:s] + meta = {} + for l in mtext.splitlines(): + k, v = l.split(": ", 1) + meta[k] = v + return meta, (s + 2) + +def packmeta(meta, text): + keys = sorted(meta) + metatext = "".join("%s: %s\n" % (k, meta[k]) for k in keys) + return "\1\n%s\1\n%s" % (metatext, text) + +def _censoredtext(text): + m, offs = parsemeta(text) + return m and "censored" in m + def addflagprocessor(flag, processor): """Register a flag processor on a revision data flag. @@ -551,9 +579,11 @@ If mmaplargeindex is True, and an mmapindexthreshold is set, the index will be mmapped rather than read if it is larger than the configured threshold. + + If censorable is True, the revlog can have censored revisions. """ def __init__(self, opener, indexfile, datafile=None, checkambig=False, - mmaplargeindex=False): + mmaplargeindex=False, censorable=False): """ create a revlog object @@ -566,6 +596,7 @@ # When True, indexfile is opened with checkambig=True at writing, to # avoid file stat ambiguity. self._checkambig = checkambig + self._censorable = censorable # 3-tuple of (node, rev, text) for a raw revision. self._cache = None # Maps rev to chain base rev. 
@@ -629,13 +660,12 @@ indexdata = '' self._initempty = True try: - f = self.opener(self.indexfile) - if (mmapindexthreshold is not None and - self.opener.fstat(f).st_size >= mmapindexthreshold): - indexdata = util.buffer(util.mmapread(f)) - else: - indexdata = f.read() - f.close() + with self._indexfp() as f: + if (mmapindexthreshold is not None and + self.opener.fstat(f).st_size >= mmapindexthreshold): + indexdata = util.buffer(util.mmapread(f)) + else: + indexdata = f.read() if len(indexdata) > 0: v = versionformat_unpack(indexdata[:4])[0] self._initempty = False @@ -690,6 +720,32 @@ def _compressor(self): return util.compengines[self._compengine].revlogcompressor() + def _indexfp(self, mode='r'): + """file object for the revlog's index file""" + args = {r'mode': mode} + if mode != 'r': + args[r'checkambig'] = self._checkambig + if mode == 'w': + args[r'atomictemp'] = True + return self.opener(self.indexfile, **args) + + def _datafp(self, mode='r'): + """file object for the revlog's data file""" + return self.opener(self.datafile, mode=mode) + + @contextlib.contextmanager + def _datareadfp(self, existingfp=None): + """file object suitable to read data""" + if existingfp is not None: + yield existingfp + else: + if self._inline: + func = self._indexfp + else: + func = self._datafp + with func() as fp: + yield fp + def tip(self): return self.node(len(self.index) - 2) def __contains__(self, rev): @@ -752,7 +808,7 @@ raise except RevlogError: # parsers.c radix tree lookup failed - if node == wdirid: + if node == wdirid or node in wdirfilenodeids: raise error.WdirUnsupported raise LookupError(node, self.indexfile, _('no node')) except KeyError: @@ -762,13 +818,15 @@ p = self._nodepos if p is None: p = len(i) - 2 + else: + assert p < len(i) for r in xrange(p, -1, -1): v = i[r][7] n[v] = r if v == node: self._nodepos = r - 1 return r - if node == wdirid: + if node == wdirid or node in wdirfilenodeids: raise error.WdirUnsupported raise LookupError(node, self.indexfile, 
_('no node')) @@ -1362,7 +1420,7 @@ try: # str(rev) rev = int(id) - if str(rev) != id: + if "%d" % rev != id: raise ValueError if rev < 0: rev = len(self) + rev @@ -1381,6 +1439,7 @@ pass def _partialmatch(self, id): + # we don't care wdirfilenodeids as they should be always full hash maybewdir = wdirhex.startswith(id) try: partial = self.index.partialmatch(id) @@ -1424,7 +1483,7 @@ if maybewdir: raise error.WdirUnsupported return None - except (TypeError, binascii.Error): + except TypeError: pass def lookup(self, id): @@ -1441,8 +1500,8 @@ raise LookupError(id, self.indexfile, _('no match found')) - def shortest(self, hexnode, minlength=1): - """Find the shortest unambiguous prefix that matches hexnode.""" + def shortest(self, node, minlength=1): + """Find the shortest unambiguous prefix that matches node.""" def isvalid(test): try: if self._partialmatch(test) is None: @@ -1464,6 +1523,7 @@ # single 'ff...' match return True + hexnode = hex(node) shortest = hexnode startlength = max(6, minlength) length = startlength @@ -1510,15 +1570,6 @@ Returns a str or buffer of raw byte data. """ - if df is not None: - closehandle = False - else: - if self._inline: - df = self.opener(self.indexfile) - else: - df = self.opener(self.datafile) - closehandle = True - # Cache data both forward and backward around the requested # data, in a fixed size window. This helps speed up operations # involving reading the revlog backwards. @@ -1526,10 +1577,9 @@ realoffset = offset & ~(cachesize - 1) reallength = (((offset + length + cachesize) & ~(cachesize - 1)) - realoffset) - df.seek(realoffset) - d = df.read(reallength) - if closehandle: - df.close() + with self._datareadfp(df) as df: + df.seek(realoffset) + d = df.read(reallength) self._cachesegment(realoffset, d) if offset != realoffset or reallength != length: return util.buffer(d, offset - realoffset, length) @@ -1829,16 +1879,21 @@ Available as a function so that subclasses can extend hash mismatch behaviors as needed. 
""" - if p1 is None and p2 is None: - p1, p2 = self.parents(node) - if node != self.hash(text, p1, p2): - revornode = rev - if revornode is None: - revornode = templatefilters.short(hex(node)) - raise RevlogError(_("integrity check failed on %s:%s") - % (self.indexfile, pycompat.bytestr(revornode))) + try: + if p1 is None and p2 is None: + p1, p2 = self.parents(node) + if node != self.hash(text, p1, p2): + revornode = rev + if revornode is None: + revornode = templatefilters.short(hex(node)) + raise RevlogError(_("integrity check failed on %s:%s") + % (self.indexfile, pycompat.bytestr(revornode))) + except RevlogError: + if self._censorable and _censoredtext(text): + raise error.CensoredNodeError(self.indexfile, node, text) + raise - def checkinlinesize(self, tr, fp=None): + def _enforceinlinesize(self, tr, fp=None): """Check if the revlog is too big for inline and convert if so. This should be called after revisions are added to the revlog. If the @@ -1867,24 +1922,20 @@ fp.flush() fp.close() - df = self.opener(self.datafile, 'w') - try: + with self._datafp('w') as df: for r in self: df.write(self._getsegmentforrevs(r, r)[1]) - finally: - df.close() - fp = self.opener(self.indexfile, 'w', atomictemp=True, - checkambig=self._checkambig) - self.version &= ~FLAG_INLINE_DATA - self._inline = False - for i in self: - e = self._io.packentry(self.index[i], self.node, self.version, i) - fp.write(e) + with self._indexfp('w') as fp: + self.version &= ~FLAG_INLINE_DATA + self._inline = False + io = self._io + for i in self: + e = io.packentry(self.index[i], self.node, self.version, i) + fp.write(e) - # if we don't call close, the temp file will never replace the - # real index - fp.close() + # the temp file replace the real index when we exit the context + # manager tr.replace(self.indexfile, trindex * self._io.size) self._chunkclear() @@ -1943,8 +1994,8 @@ """ dfh = None if not self._inline: - dfh = self.opener(self.datafile, "a+") - ifh = self.opener(self.indexfile, "a+", 
checkambig=self._checkambig) + dfh = self._datafp("a+") + ifh = self._indexfp("a+") try: return self._addrevision(node, rawtext, transaction, link, p1, p2, flags, cachedelta, ifh, dfh, @@ -2005,7 +2056,8 @@ try: return _zlibdecompress(data) except zlib.error as e: - raise RevlogError(_('revlog decompress error: %s') % str(e)) + raise RevlogError(_('revlog decompress error: %s') % + stringutil.forcebytestr(e)) # '\0' is more common than 'u' so it goes first. elif t == '\0': return data @@ -2067,7 +2119,7 @@ if node == nullid: raise RevlogError(_("%s: attempt to add null revision") % (self.indexfile)) - if node == wdirid: + if node == wdirid or node in wdirfilenodeids: raise RevlogError(_("%s: attempt to add wdir revision") % (self.indexfile)) @@ -2129,7 +2181,7 @@ if alwayscache and rawtext is None: rawtext = deltacomputer._buildtext(revinfo, fh) - if type(rawtext) == str: # only accept immutable objects + if type(rawtext) == bytes: # only accept immutable objects self._cache = (node, curr, rawtext) self._chainbasecache[curr] = chainbase return node @@ -2163,7 +2215,7 @@ ifh.write(entry) ifh.write(data[0]) ifh.write(data[1]) - self.checkinlinesize(transaction, ifh) + self._enforceinlinesize(transaction, ifh) def addgroup(self, deltas, linkmapper, transaction, addrevisioncb=None): """ @@ -2183,7 +2235,7 @@ end = 0 if r: end = self.end(r - 1) - ifh = self.opener(self.indexfile, "a+", checkambig=self._checkambig) + ifh = self._indexfp("a+") isize = r * self._io.size if self._inline: transaction.add(self.indexfile, end + isize, r) @@ -2191,7 +2243,7 @@ else: transaction.add(self.indexfile, isize, r) transaction.add(self.datafile, end) - dfh = self.opener(self.datafile, "a+") + dfh = self._datafp("a+") def flush(): if dfh: dfh.flush() @@ -2254,9 +2306,8 @@ # addrevision switched from inline to conventional # reopen the index ifh.close() - dfh = self.opener(self.datafile, "a+") - ifh = self.opener(self.indexfile, "a+", - checkambig=self._checkambig) + dfh = 
self._datafp("a+") + ifh = self._indexfp("a+") finally: if dfh: dfh.close() @@ -2266,11 +2317,33 @@ def iscensored(self, rev): """Check if a file revision is censored.""" - return False + if not self._censorable: + return False + + return self.flags(rev) & REVIDX_ISCENSORED def _peek_iscensored(self, baserev, delta, flush): """Quickly check if a delta produces a censored revision.""" - return False + if not self._censorable: + return False + + # Fragile heuristic: unless new file meta keys are added alphabetically + # preceding "censored", all censored revisions are prefixed by + # "\1\ncensored:". A delta producing such a censored revision must be a + # full-replacement delta, so we inspect the first and only patch in the + # delta for this prefix. + hlen = struct.calcsize(">lll") + if len(delta) <= hlen: + return False + + oldlen = self.rawsize(baserev) + newlen = len(delta) - hlen + if delta[:hlen] != mdiff.replacediffheader(oldlen, newlen): + return False + + add = "\1\ncensored:" + addlen = len(add) + return newlen >= addlen and delta[hlen:hlen + addlen] == add def getstrippoint(self, minlink): """find the minimum rev that must be stripped to strip the linkrev @@ -2351,6 +2424,7 @@ del self.nodemap[self.node(x)] del self.index[rev:-1] + self._nodepos = None def checksize(self): expected = 0 @@ -2358,10 +2432,9 @@ expected = max(0, self.end(len(self) - 1)) try: - f = self.opener(self.datafile) - f.seek(0, 2) - actual = f.tell() - f.close() + with self._datafp() as f: + f.seek(0, 2) + actual = f.tell() dd = actual - expected except IOError as inst: if inst.errno != errno.ENOENT: @@ -2488,7 +2561,7 @@ if populatecachedelta: dp = self.deltaparent(rev) if dp != nullrev: - cachedelta = (dp, str(self._chunk(rev))) + cachedelta = (dp, bytes(self._chunk(rev))) if not cachedelta: rawtext = self.revision(rev, raw=True) diff -r fb92df8b634c -r ed5448edcbfa mercurial/revset.py --- a/mercurial/revset.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/revset.py Wed Apr 18 
15:32:08 2018 -0400 @@ -28,8 +28,13 @@ revsetlang, scmutil, smartset, + stack as stackmod, util, ) +from .utils import ( + dateutil, + stringutil, +) # helpers for processing parsed tree getsymbol = revsetlang.getsymbol @@ -105,10 +110,15 @@ pass return None +def _sortedb(xs): + return sorted(util.rapply(pycompat.maybebytestr, xs)) + # operator methods def stringset(repo, subset, x, order): - x = scmutil.intrev(repo[x]) + if not x: + raise error.ParseError(_("empty string is not a valid revision")) + x = scmutil.intrev(scmutil.revsymbol(repo, x)) if (x in subset or x == node.nullrev and isinstance(subset, fullreposet)): return baseset([x]) @@ -442,7 +452,7 @@ bm = getstring(args[0], # i18n: "bookmark" is a keyword _('the argument to bookmark must be a string')) - kind, pattern, matcher = util.stringmatcher(bm) + kind, pattern, matcher = stringutil.stringmatcher(bm) bms = set() if kind == 'literal': bmrev = repo._bookmarks.get(pattern, None) @@ -487,7 +497,7 @@ # not a string, but another revspec, e.g. 
tip() pass else: - kind, pattern, matcher = util.stringmatcher(b) + kind, pattern, matcher = stringutil.stringmatcher(b) if kind == 'literal': # note: falls through to the revspec case if no branch with # this name exists and pattern kind is not specified explicitly @@ -507,15 +517,7 @@ b.add(getbranch(r)) c = s.__contains__ return subset.filter(lambda r: c(r) or getbranch(r) in b, - condrepr=lambda: '' % sorted(b)) - -@predicate('bumped()', safe=True) -def bumped(repo, subset, x): - msg = ("'bumped()' is deprecated, " - "use 'phasedivergent()'") - repo.ui.deprecwarn(msg, '4.4') - - return phasedivergent(repo, subset, x) + condrepr=lambda: '' % _sortedb(b)) @predicate('phasedivergent()', safe=True) def phasedivergent(repo, subset, x): @@ -663,7 +665,7 @@ """ # i18n: "date" is a keyword ds = getstring(x, _("date requires a string")) - dm = util.matchdate(ds) + dm = dateutil.matchdate(ds) return subset.filter(lambda x: dm(repo[x].date()[0]), condrepr=('', ds)) @@ -768,15 +770,7 @@ src = _getrevsource(repo, r) return subset.filter(dests.__contains__, - condrepr=lambda: '' % sorted(dests)) - -@predicate('divergent()', safe=True) -def divergent(repo, subset, x): - msg = ("'divergent()' is deprecated, " - "use 'contentdivergent()'") - repo.ui.deprecwarn(msg, '4.4') - - return contentdivergent(repo, subset, x) + condrepr=lambda: '' % _sortedb(dests)) @predicate('contentdivergent()', safe=True) def contentdivergent(repo, subset, x): @@ -830,7 +824,7 @@ # i18n: "extra" is a keyword value = getstring(args['value'], _('second argument to extra must be ' 'a string')) - kind, value, matcher = util.stringmatcher(value) + kind, value, matcher = stringutil.stringmatcher(value) def _matchvalue(r): extra = repo[r].extra() @@ -1024,7 +1018,8 @@ # i18n: "grep" is a keyword gr = re.compile(getstring(x, _("grep requires a string"))) except re.error as e: - raise error.ParseError(_('invalid match pattern: %s') % e) + raise error.ParseError( + _('invalid match pattern: %s') % 
stringutil.forcebytestr(e)) def matches(x): c = repo[x] @@ -1296,7 +1291,7 @@ ns = getstring(args[0], # i18n: "named" is a keyword _('the argument to named must be a string')) - kind, pattern, matcher = util.stringmatcher(ns) + kind, pattern, matcher = stringutil.stringmatcher(ns) namespaces = set() if kind == 'literal': if pattern not in repo.names: @@ -1543,6 +1538,21 @@ target = phases.secret return _phase(repo, subset, target) +@predicate('stack([revs])', safe=True) +def stack(repo, subset, x): + """Experimental revset for the stack of changesets or working directory + parent. (EXPERIMENTAL) + """ + if x is None: + stacks = stackmod.getstack(repo, x) + else: + stacks = smartset.baseset([]) + for revision in getset(repo, fullreposet(repo), x): + currentstack = stackmod.getstack(repo, revision) + stacks = stacks + currentstack + + return subset & stacks + def parentspec(repo, subset, x, n, order): """``set^0`` The set. @@ -1854,11 +1864,12 @@ keyflags = [] for k in keys.split(): fk = k - reverse = (k[0] == '-') + reverse = (k.startswith('-')) if reverse: k = k[1:] if k not in _sortkeyfuncs and k != 'topo': - raise error.ParseError(_("unknown sort key %r") % fk) + raise error.ParseError( + _("unknown sort key %r") % pycompat.bytestr(fk)) keyflags.append((k, reverse)) if len(keyflags) > 1 and any(k == 'topo' for k, reverse in keyflags): @@ -1936,7 +1947,7 @@ m = matchmod.exact(repo.root, repo.root, ['.hgsubstate']) def submatches(names): - k, p, m = util.stringmatcher(pat) + k, p, m = stringutil.stringmatcher(pat) for name in names: if m(name): yield name @@ -1989,8 +2000,8 @@ return subset & d def _substringmatcher(pattern, casesensitive=True): - kind, pattern, matcher = util.stringmatcher(pattern, - casesensitive=casesensitive) + kind, pattern, matcher = stringutil.stringmatcher( + pattern, casesensitive=casesensitive) if kind == 'literal': if not casesensitive: pattern = encoding.lower(pattern) @@ -2013,7 +2024,7 @@ pattern = getstring(args[0], # i18n: "tag" is 
a keyword _('the argument to tag must be a string')) - kind, pattern, matcher = util.stringmatcher(pattern) + kind, pattern, matcher = stringutil.stringmatcher(pattern) if kind == 'literal': # avoid resolving all tags tn = repo._tagscache.tags.get(pattern, None) @@ -2031,14 +2042,6 @@ def tagged(repo, subset, x): return tag(repo, subset, x) -@predicate('unstable()', safe=True) -def unstable(repo, subset, x): - msg = ("'unstable()' is deprecated, " - "use 'orphan()'") - repo.ui.deprecwarn(msg, '4.4') - - return orphan(repo, subset, x) - @predicate('orphan()', safe=True) def orphan(repo, subset, x): """Non-obsolete changesets with obsolete ancestors. (EXPERIMENTAL) @@ -2080,7 +2083,7 @@ try: # fast path for integer revision r = int(t) - if str(r) != t or r not in cl: + if ('%d' % r) != t or r not in cl: raise ValueError revs = [r] except ValueError: @@ -2163,18 +2166,20 @@ "parentpost": parentpost, } -def posttreebuilthook(tree, repo): - # hook for extensions to execute code on the optimized tree - pass +def lookupfn(repo): + return lambda symbol: scmutil.isrevsymbol(repo, symbol) -def match(ui, spec, repo=None): +def match(ui, spec, lookup=None): """Create a matcher for a single revision spec""" - return matchany(ui, [spec], repo=repo) + return matchany(ui, [spec], lookup=None) -def matchany(ui, specs, repo=None, localalias=None): +def matchany(ui, specs, lookup=None, localalias=None): """Create a matcher that will include any revisions matching one of the given specs + If lookup function is not None, the parser will first attempt to handle + old-style ranges, which may contain operator characters. + If localalias is not None, it is a dict {name: definitionstring}. It takes precedence over [revsetalias] config section. 
""" @@ -2184,9 +2189,6 @@ return mfunc if not all(specs): raise error.ParseError(_("empty query")) - lookup = None - if repo: - lookup = repo.__contains__ if len(specs) == 1: tree = revsetlang.parse(specs[0], lookup) else: @@ -2205,7 +2207,6 @@ tree = revsetlang.foldconcat(tree) tree = revsetlang.analyze(tree) tree = revsetlang.optimize(tree) - posttreebuilthook(tree, repo) return makematcher(tree) def makematcher(tree): diff -r fb92df8b634c -r ed5448edcbfa mercurial/revsetlang.py --- a/mercurial/revsetlang.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/revsetlang.py Wed Apr 18 15:32:08 2018 -0400 @@ -17,6 +17,9 @@ pycompat, util, ) +from .utils import ( + stringutil, +) elements = { # token-type: binding-strength, primary, prefix, infix, suffix @@ -86,6 +89,9 @@ [('symbol', '@', 0), ('::', None, 1), ('end', None, 3)] ''' + if not isinstance(program, bytes): + raise error.ProgrammingError('revset statement must be bytes, got %r' + % program) program = pycompat.bytestr(program) if syminitletters is None: syminitletters = _syminitletters @@ -207,7 +213,7 @@ raise error.ParseError(err) def getboolean(x, err): - value = util.parsebool(getsymbol(x)) + value = stringutil.parsebool(getsymbol(x)) if value is not None: return value raise error.ParseError(err) @@ -349,6 +355,9 @@ elif op == 'keyvalue': return (op, x[1], _analyze(x[2])) elif op == 'func': + f = getsymbol(x[1]) + if f == 'revset': + return _analyze(x[2]) return (op, x[1], _analyze(x[2])) raise ValueError('invalid operator %r' % op) @@ -479,6 +488,8 @@ ... 
ParseError: ('invalid token', 4) """ + if lookup and spec.startswith('revset(') and spec.endswith(')'): + lookup = None p = parser.parser(elements) tree, pos = p.parse(tokenize(spec, lookup=lookup, syminitletters=syminitletters)) @@ -539,7 +550,19 @@ return tuple(foldconcat(t) for t in tree) def parse(spec, lookup=None): - return _parsewith(spec, lookup=lookup) + try: + return _parsewith(spec, lookup=lookup) + except error.ParseError as inst: + if len(inst.args) > 1: # has location + loc = inst.args[1] + # Remove newlines -- spaces are equivalent whitespace. + spec = spec.replace('\n', ' ') + # We want the caret to point to the place in the template that + # failed to parse, but in a hint we get a open paren at the + # start. Therefore, we print "loc + 1" spaces (instead of "loc") + # to line up the caret with the location of the error. + inst.hint = spec + '\n' + ' ' * (loc + 1) + '^ ' + _('here') + raise def _quote(s): r"""Quote a value in order to make it safe for the revset engine. @@ -553,7 +576,7 @@ >>> _quote(1) "'1'" """ - return "'%s'" % util.escapestr(pycompat.bytestr(s)) + return "'%s'" % stringutil.escapestr(pycompat.bytestr(s)) def _formatargtype(c, arg): if c == 'd': @@ -561,6 +584,8 @@ elif c == 's': return _quote(arg) elif c == 'r': + if not isinstance(arg, bytes): + raise TypeError parse(arg) # make sure syntax errors are confined return '(%s)' % arg elif c == 'n': @@ -635,7 +660,7 @@ "root(_list('a\\\\x00b\\\\x00c\\\\x00d'))" >>> formatspec(b'sort(%r, %ps)', b':', [b'desc', b'user']) "sort((:), 'desc', 'user')" - >>> formatspec('%ls', ['a', "'"]) + >>> formatspec(b'%ls', [b'a', b"'"]) "_list('a\\\\x00\\\\'')" ''' expr = pycompat.bytestr(expr) @@ -717,13 +742,13 @@ def gethashlikesymbols(tree): """returns the list of symbols of the tree that look like hashes - >>> gethashlikesymbols(('dagrange', ('symbol', '3'), ('symbol', 'abe3ff'))) + >>> gethashlikesymbols(parse(b'3::abe3ff')) ['3', 'abe3ff'] - >>> gethashlikesymbols(('func', ('symbol', 
'precursors'), ('symbol', '.'))) + >>> gethashlikesymbols(parse(b'precursors(.)')) [] - >>> gethashlikesymbols(('func', ('symbol', 'precursors'), ('symbol', '34'))) + >>> gethashlikesymbols(parse(b'precursors(34)')) ['34'] - >>> gethashlikesymbols(('symbol', 'abe3ffZ')) + >>> gethashlikesymbols(parse(b'abe3ffZ')) [] """ if not tree: diff -r fb92df8b634c -r ed5448edcbfa mercurial/scmutil.py --- a/mercurial/scmutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/scmutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -18,6 +18,7 @@ from .i18n import _ from .node import ( + bin, hex, nullid, short, @@ -41,6 +42,11 @@ vfs, ) +from .utils import ( + procutil, + stringutil, +) + if pycompat.iswindows: from . import scmwindows as scmplatform else: @@ -162,13 +168,14 @@ reason = _('timed out waiting for lock held by %r') % inst.locker else: reason = _('lock held by %r') % inst.locker - ui.warn(_("abort: %s: %s\n") % (inst.desc or inst.filename, reason)) + ui.warn(_("abort: %s: %s\n") + % (inst.desc or stringutil.forcebytestr(inst.filename), reason)) if not inst.locker: ui.warn(_("(lock might be very busy)\n")) except error.LockUnavailable as inst: ui.warn(_("abort: could not lock %s: %s\n") % - (inst.desc or inst.filename, - encoding.strtolocal(inst.strerror))) + (inst.desc or stringutil.forcebytestr(inst.filename), + encoding.strtolocal(inst.strerror))) except error.OutOfBandError as inst: if inst.args: msg = _("abort: remote error:\n") @@ -185,12 +192,15 @@ ui.warn(_("(%s)\n") % inst.hint) except error.ResponseError as inst: ui.warn(_("abort: %s") % inst.args[0]) - if not isinstance(inst.args[1], basestring): - ui.warn(" %r\n" % (inst.args[1],)) - elif not inst.args[1]: + msg = inst.args[1] + if isinstance(msg, type(u'')): + msg = pycompat.sysbytes(msg) + if not isinstance(msg, bytes): + ui.warn(" %r\n" % (msg,)) + elif not msg: ui.warn(_(" empty string\n")) else: - ui.warn("\n%r\n" % util.ellipsis(inst.args[1])) + ui.warn("\n%r\n" % stringutil.ellipsis(msg)) except 
error.CensoredNodeError as inst: ui.warn(_("abort: file censored %s!\n") % inst) except error.RevlogError as inst: @@ -207,15 +217,15 @@ if inst.hint: ui.warn(_("(%s)\n") % inst.hint) except ImportError as inst: - ui.warn(_("abort: %s!\n") % inst) - m = str(inst).split()[-1] + ui.warn(_("abort: %s!\n") % stringutil.forcebytestr(inst)) + m = stringutil.forcebytestr(inst).split()[-1] if m in "mpatch bdiff".split(): ui.warn(_("(did you forget to compile extensions?)\n")) elif m in "zlib".split(): ui.warn(_("(is your Python install correct?)\n")) except IOError as inst: if util.safehasattr(inst, "code"): - ui.warn(_("abort: %s\n") % inst) + ui.warn(_("abort: %s\n") % stringutil.forcebytestr(inst)) elif util.safehasattr(inst, "reason"): try: # usually it is in the form (errno, strerror) reason = inst.reason.args[1] @@ -232,7 +242,8 @@ elif getattr(inst, "strerror", None): if getattr(inst, "filename", None): ui.warn(_("abort: %s: %s\n") % ( - encoding.strtolocal(inst.strerror), inst.filename)) + encoding.strtolocal(inst.strerror), + stringutil.forcebytestr(inst.filename))) else: ui.warn(_("abort: %s\n") % encoding.strtolocal(inst.strerror)) else: @@ -240,7 +251,8 @@ except OSError as inst: if getattr(inst, "filename", None) is not None: ui.warn(_("abort: %s: '%s'\n") % ( - encoding.strtolocal(inst.strerror), inst.filename)) + encoding.strtolocal(inst.strerror), + stringutil.forcebytestr(inst.filename))) else: ui.warn(_("abort: %s\n") % encoding.strtolocal(inst.strerror)) except MemoryError: @@ -250,7 +262,7 @@ # Just in case catch this and and pass exit code to caller. 
return inst.code except socket.error as inst: - ui.warn(_("abort: %s\n") % inst.args[-1]) + ui.warn(_("abort: %s\n") % stringutil.forcebytestr(inst.args[-1])) return -1 @@ -261,12 +273,15 @@ raise error.Abort(_("the name '%s' is reserved") % lbl) for c in (':', '\0', '\n', '\r'): if c in lbl: - raise error.Abort(_("%r cannot be used in a name") % c) + raise error.Abort( + _("%r cannot be used in a name") % pycompat.bytestr(c)) try: int(lbl) raise error.Abort(_("cannot use an integer as a name")) except ValueError: pass + if lbl.strip() != lbl: + raise error.Abort(_("leading or trailing whitespace in name %r") % lbl) def checkfilename(f): '''Check that the filename f is an acceptable filename for a tracked file''' @@ -280,7 +295,7 @@ if abort or warn: msg = util.checkwinfilename(f) if msg: - msg = "%s: %s" % (msg, util.shellquote(f)) + msg = "%s: %s" % (msg, procutil.shellquote(f)) if abort: raise error.Abort(msg) ui.warn(_("warning: %s\n") % msg) @@ -290,7 +305,7 @@ non-portable filenames''' val = ui.config('ui', 'portablefilenames') lval = val.lower() - bval = util.parsebool(val) + bval = stringutil.parsebool(val) abort = pycompat.iswindows or lval == 'abort' warn = bval or lval == 'warn' if bval is None and not (warn or abort or lval == 'ignore'): @@ -355,12 +370,8 @@ samestat = getattr(os.path, 'samestat', None) if followsym and samestat is not None: def adddir(dirlst, dirname): - match = False dirstat = os.stat(dirname) - for lstdirstat in dirlst: - if samestat(dirstat, lstdirstat): - match = True - break + match = any(samestat(dirstat, lstdirstat) for lstdirstat in dirlst) if not match: dirlst.append(dirstat) return not match @@ -411,7 +422,7 @@ def formatchangeid(ctx): """Format changectx as '{rev}:{node|formatnode}', which is the default - template provided by cmdutil.changeset_templater""" + template provided by logcmdutil.changesettemplater""" repo = ctx.repo() return formatrevnode(repo.ui, intrev(ctx), binnode(ctx)) @@ -423,6 +434,120 @@ hexfunc = short 
return '%d:%s' % (rev, hexfunc(node)) +def resolvehexnodeidprefix(repo, prefix): + # Uses unfiltered repo because it's faster when prefix is ambiguous/ + # This matches the shortesthexnodeidprefix() function below. + node = repo.unfiltered().changelog._partialmatch(prefix) + if node is None: + return + repo.changelog.rev(node) # make sure node isn't filtered + return node + +def shortesthexnodeidprefix(repo, node, minlength=1): + """Find the shortest unambiguous prefix that matches hexnode.""" + # _partialmatch() of filtered changelog could take O(len(repo)) time, + # which would be unacceptably slow. so we look for hash collision in + # unfiltered space, which means some hashes may be slightly longer. + return repo.unfiltered().changelog.shortest(node, minlength) + +def isrevsymbol(repo, symbol): + """Checks if a symbol exists in the repo. + + See revsymbol() for details. Raises error.LookupError if the symbol is an + ambiguous nodeid prefix. + """ + try: + revsymbol(repo, symbol) + return True + except error.RepoLookupError: + return False + +def revsymbol(repo, symbol): + """Returns a context given a single revision symbol (as string). + + This is similar to revsingle(), but accepts only a single revision symbol, + i.e. things like ".", "tip", "1234", "deadbeef", "my-bookmark" work, but + not "max(public())". + """ + if not isinstance(symbol, bytes): + msg = ("symbol (%s of type %s) was not a string, did you mean " + "repo[symbol]?" 
% (symbol, type(symbol))) + raise error.ProgrammingError(msg) + try: + if symbol in ('.', 'tip', 'null'): + return repo[symbol] + + try: + r = int(symbol) + if '%d' % r != symbol: + raise ValueError + l = len(repo.changelog) + if r < 0: + r += l + if r < 0 or r >= l and r != wdirrev: + raise ValueError + return repo[r] + except error.FilteredIndexError: + raise + except (ValueError, OverflowError, IndexError): + pass + + if len(symbol) == 40: + try: + node = bin(symbol) + rev = repo.changelog.rev(node) + return repo[rev] + except error.FilteredLookupError: + raise + except (TypeError, LookupError): + pass + + # look up bookmarks through the name interface + try: + node = repo.names.singlenode(repo, symbol) + rev = repo.changelog.rev(node) + return repo[rev] + except KeyError: + pass + + node = resolvehexnodeidprefix(repo, symbol) + if node is not None: + rev = repo.changelog.rev(node) + return repo[rev] + + raise error.RepoLookupError(_("unknown revision '%s'") % symbol) + + except error.WdirUnsupported: + return repo[None] + except (error.FilteredIndexError, error.FilteredLookupError, + error.FilteredRepoLookupError): + raise _filterederror(repo, symbol) + +def _filterederror(repo, changeid): + """build an exception to be raised about a filtered changeid + + This is extracted in a function to help extensions (eg: evolve) to + experiment with various message variants.""" + if repo.filtername.startswith('visible'): + + # Check if the changeset is obsolete + unfilteredrepo = repo.unfiltered() + ctx = revsymbol(unfilteredrepo, changeid) + + # If the changeset is obsolete, enrich the message with the reason + # that made this changeset not visible + if ctx.obsolete(): + msg = obsutil._getfilteredreason(repo, changeid, ctx) + else: + msg = _("hidden revision '%s'") % changeid + + hint = _('use --hidden to access hidden revisions') + + return error.FilteredRepoLookupError(msg, hint=hint) + msg = _("filtered revision '%s' (not in '%s' subset)") + msg %= (changeid, 
repo.filtername) + return error.FilteredRepoLookupError(msg) + def revsingle(repo, revspec, default='.', localalias=None): if not revspec and revspec != 0: return repo[default] @@ -436,9 +561,14 @@ tree = revsetlang.parse(revspec) return tree and tree[0] in ('range', 'rangepre', 'rangepost', 'rangeall') +def revpairnodes(repo, revs): + repo.ui.deprecwarn("revpairnodes is deprecated, please use revpair", "4.6") + ctx1, ctx2 = revpair(repo, revs) + return ctx1.node(), ctx2.node() + def revpair(repo, revs): if not revs: - return repo.dirstate.p1(), None + return repo['.'], repo[None] l = revrange(repo, revs) @@ -462,9 +592,9 @@ # if top-level is range expression, the result must always be a pair if first == second and len(revs) == 1 and not _pairspec(revs[0]): - return repo.lookup(first), None + return repo[first], repo[None] - return repo.lookup(first), repo.lookup(second) + return repo[first], repo[second] def revrange(repo, specs, localalias=None): """Execute 1 to many revsets and return the union. @@ -684,7 +814,8 @@ continue from . 
import bookmarks # avoid import cycle repo.ui.debug('moving bookmarks %r from %s to %s\n' % - (oldbmarks, hex(oldnode), hex(newnode))) + (util.rapply(pycompat.maybebytestr, oldbmarks), + hex(oldnode), hex(newnode))) # Delete divergent bookmarks being parents of related newnodes deleterevs = repo.revs('parents(roots(%ln & (::%n))) - parents(%n)', allnewnodes, newnode, oldnode) @@ -720,14 +851,18 @@ if tostrip: repair.delayedstrip(repo.ui, repo, tostrip, operation) -def addremove(repo, matcher, prefix, opts=None, dry_run=None, similarity=None): +def addremove(repo, matcher, prefix, opts=None): if opts is None: opts = {} m = matcher - if dry_run is None: - dry_run = opts.get('dry_run') - if similarity is None: + dry_run = opts.get('dry_run') + try: similarity = float(opts.get('similarity') or 0) + except ValueError: + raise error.Abort(_('similarity must be a number')) + if similarity < 0 or similarity > 100: + raise error.Abort(_('similarity must be between 0 and 100')) + similarity /= 100.0 ret = 0 join = lambda f: os.path.join(prefix, f) @@ -738,7 +873,7 @@ if opts.get('subrepos') or m.exact(subpath) or any(submatch.files()): sub = wctx.sub(subpath) try: - if sub.addremove(submatch, prefix, opts, dry_run, similarity): + if sub.addremove(submatch, prefix, opts): ret = 1 except error.LookupError: repo.ui.status(_("skipping missing subrepository: %s\n") @@ -885,7 +1020,7 @@ missings = [] for r in requirements: if r not in supported: - if not r or not r[0].isalnum(): + if not r or not r[0:1].isalnum(): raise error.RequirementError(_(".hg/requires file is corrupt")) missings.append(r) missings.sort() @@ -1080,7 +1215,7 @@ # external commands should be run relative to the repo root cmd = spec[6:] proc = subprocess.Popen(cmd, shell=True, bufsize=-1, - close_fds=util.closefds, + close_fds=procutil.closefds, stdout=subprocess.PIPE, cwd=repo.root) src = proc.stdout else: @@ -1094,7 +1229,7 @@ k = encoding.tolocal(k) try: - data[repo[k].rev()] = encoding.tolocal(v) + 
data[revsingle(repo, k).rev()] = encoding.tolocal(v) except (error.LookupError, error.RepoLookupError): pass # we ignore data for nodes that don't exist locally finally: @@ -1104,7 +1239,7 @@ src.close() if proc and proc.returncode != 0: raise error.Abort(_("extdata command '%s' failed: %s") - % (cmd, util.explainexit(proc.returncode)[0])) + % (cmd, procutil.explainexit(proc.returncode))) return data @@ -1196,7 +1331,7 @@ if k == self.firstlinekey: e = "key name '%s' is reserved" % self.firstlinekey raise error.ProgrammingError(e) - if not k[0].isalpha(): + if not k[0:1].isalpha(): e = "keys must start with a letter in a key-value file" raise error.ProgrammingError(e) if not k.isalnum(): @@ -1222,6 +1357,22 @@ 'unbundle', ] +def prefetchfiles(repo, revs, match): + """Invokes the registered file prefetch functions, allowing extensions to + ensure the corresponding files are available locally, before the command + uses them.""" + if match: + # The command itself will complain about files that don't exist, so + # don't duplicate the message. 
+ match = matchmod.badmatch(match, lambda fn, msg: None) + else: + match = matchall(repo) + + fileprefetchhooks(repo, revs, match) + +# a list of (repo, revs, match) prefetch functions +fileprefetchhooks = util.hooks() + # A marker that tells the evolve extension to suppress its own reporting _reportstroubledchangesets = True @@ -1404,7 +1555,7 @@ try: s = pmatch(s) - except error.LookupError: + except (error.LookupError, error.WdirUnsupported): s = None if s is not None: diff -r fb92df8b634c -r ed5448edcbfa mercurial/scmwindows.py --- a/mercurial/scmwindows.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/scmwindows.py Wed Apr 18 15:32:08 2018 -0400 @@ -21,7 +21,7 @@ def systemrcpath(): '''return default os-specific hgrc search path''' rcpath = [] - filename = util.executablepath() + filename = win32.executablepath() # Use mercurial.ini found in directory with hg.exe progrc = os.path.join(os.path.dirname(filename), 'mercurial.ini') rcpath.append(progrc) diff -r fb92df8b634c -r ed5448edcbfa mercurial/server.py --- a/mercurial/server.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/server.py Wed Apr 18 15:32:08 2018 -0400 @@ -22,10 +22,44 @@ util, ) +from .utils import ( + procutil, +) + def runservice(opts, parentfn=None, initfn=None, runfn=None, logfile=None, runargs=None, appendpid=False): '''Run a command as a service.''' + postexecargs = {} + + if opts['daemon_postexec']: + for inst in opts['daemon_postexec']: + if inst.startswith('unlink:'): + postexecargs['unlink'] = inst[7:] + elif inst.startswith('chdir:'): + postexecargs['chdir'] = inst[6:] + elif inst != 'none': + raise error.Abort(_('invalid value for --daemon-postexec: %s') + % inst) + + # When daemonized on Windows, redirect stdout/stderr to the lockfile (which + # gets cleaned up after the child is up and running), so that the parent can + # read and print the error if this child dies early. See 594dd384803c. 
On + # other platforms, the child can write to the parent's stdio directly, until + # it is redirected prior to runfn(). + if pycompat.iswindows and opts['daemon_postexec']: + if 'unlink' in postexecargs and os.path.exists(postexecargs['unlink']): + procutil.stdout.flush() + procutil.stderr.flush() + + fd = os.open(postexecargs['unlink'], + os.O_WRONLY | os.O_APPEND | os.O_BINARY) + try: + os.dup2(fd, procutil.stdout.fileno()) + os.dup2(fd, procutil.stderr.fileno()) + finally: + os.close(fd) + def writepid(pid): if opts['pid_file']: if appendpid: @@ -42,7 +76,7 @@ os.close(lockfd) try: if not runargs: - runargs = util.hgcmd() + pycompat.sysargv[1:] + runargs = procutil.hgcmd() + pycompat.sysargv[1:] runargs.append('--daemon-postexec=unlink:%s' % lockpath) # Don't pass --cwd to the child process, because we've already # changed directory. @@ -55,8 +89,14 @@ break def condfn(): return not os.path.exists(lockpath) - pid = util.rundetached(runargs, condfn) + pid = procutil.rundetached(runargs, condfn) if pid < 0: + # If the daemonized process managed to write out an error msg, + # report it. 
+ if pycompat.iswindows and os.path.exists(lockpath): + with open(lockpath, 'rb') as log: + for line in log: + procutil.stderr.write(line) raise error.Abort(_('child process failed to start')) writepid(pid) finally: @@ -70,39 +110,40 @@ initfn() if not opts['daemon']: - writepid(util.getpid()) + writepid(procutil.getpid()) if opts['daemon_postexec']: try: os.setsid() except AttributeError: pass - for inst in opts['daemon_postexec']: - if inst.startswith('unlink:'): - lockpath = inst[7:] - os.unlink(lockpath) - elif inst.startswith('chdir:'): - os.chdir(inst[6:]) - elif inst != 'none': - raise error.Abort(_('invalid value for --daemon-postexec: %s') - % inst) - util.hidewindow() - util.stdout.flush() - util.stderr.flush() + + if 'chdir' in postexecargs: + os.chdir(postexecargs['chdir']) + procutil.hidewindow() + procutil.stdout.flush() + procutil.stderr.flush() nullfd = os.open(os.devnull, os.O_RDWR) logfilefd = nullfd if logfile: logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | os.O_APPEND, 0o666) - os.dup2(nullfd, 0) - os.dup2(logfilefd, 1) - os.dup2(logfilefd, 2) - if nullfd not in (0, 1, 2): + os.dup2(nullfd, procutil.stdin.fileno()) + os.dup2(logfilefd, procutil.stdout.fileno()) + os.dup2(logfilefd, procutil.stderr.fileno()) + stdio = (procutil.stdin.fileno(), procutil.stdout.fileno(), + procutil.stderr.fileno()) + if nullfd not in stdio: os.close(nullfd) - if logfile and logfilefd not in (0, 1, 2): + if logfile and logfilefd not in stdio: os.close(logfilefd) + # Only unlink after redirecting stdout/stderr, so Windows doesn't + # complain about a sharing violation. 
+ if 'unlink' in postexecargs: + os.unlink(postexecargs['unlink']) + if runfn: return runfn() diff -r fb92df8b634c -r ed5448edcbfa mercurial/setdiscovery.py --- a/mercurial/setdiscovery.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/setdiscovery.py Wed Apr 18 15:32:08 2018 -0400 @@ -106,7 +106,7 @@ :nodes: set of nodes to discover :size: the maximum size of the sample""" sample = dag.headsetofconnecteds(nodes) - if size <= len(sample): + if len(sample) >= size: return _limitsample(sample, size) _updatesample(dag, None, sample, quicksamplesize=size) return sample @@ -155,11 +155,14 @@ sample = _limitsample(ownheads, initialsamplesize) # indices between sample and externalized version must match sample = list(sample) - batch = remote.iterbatch() - batch.heads() - batch.known(dag.externalizeall(sample)) - batch.submit() - srvheadhashes, yesno = batch.results() + + with remote.commandexecutor() as e: + fheads = e.callcommand('heads', {}) + fknown = e.callcommand('known', { + 'nodes': dag.externalizeall(sample), + }) + + srvheadhashes, yesno = fheads.result(), fknown.result() if cl.tip() == nullid: if srvheadhashes != [nullid]: @@ -175,7 +178,7 @@ ui.debug("all remote heads known locally\n") return (srvheadhashes, False, srvheadhashes,) - if sample and len(ownheads) <= initialsamplesize and all(yesno): + if len(sample) == len(ownheads) and all(yesno): ui.note(_("all local heads known remotely\n")) ownheadhashes = dag.externalizeall(ownheads) return (ownheadhashes, True, srvheadhashes,) @@ -221,7 +224,6 @@ sample = list(undecided) else: sample = samplefunc(dag, undecided, targetsize) - sample = _limitsample(sample, targetsize) roundtrips += 1 ui.progress(_('searching'), roundtrips, unit=_('queries')) @@ -229,7 +231,12 @@ % (roundtrips, len(undecided), len(sample))) # indices between sample and externalized version must match sample = list(sample) - yesno = remote.known(dag.externalizeall(sample)) + + with remote.commandexecutor() as e: + yesno = e.callcommand('known', 
{ + 'nodes': dag.externalizeall(sample), + }).result() + full = True if sample: diff -r fb92df8b634c -r ed5448edcbfa mercurial/simplemerge.py --- a/mercurial/simplemerge.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/simplemerge.py Wed Apr 18 15:32:08 2018 -0400 @@ -23,7 +23,9 @@ error, mdiff, pycompat, - util, +) +from .utils import ( + stringutil, ) class CantReprocessAndShowBase(Exception): @@ -397,7 +399,7 @@ def _verifytext(text, path, ui, opts): """verifies that text is non-binary (unless opts[text] is passed, then we just warn)""" - if util.binary(text): + if stringutil.binary(text): msg = _("%s looks like a binary file.") % path if not opts.get('quiet'): ui.warn(_('warning: %s\n') % msg) diff -r fb92df8b634c -r ed5448edcbfa mercurial/smartset.py --- a/mercurial/smartset.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/smartset.py Wed Apr 18 15:32:08 2018 -0400 @@ -8,7 +8,9 @@ from __future__ import absolute_import from . import ( + encoding, error, + pycompat, util, ) @@ -19,7 +21,7 @@ type(r) example ======== ================================= tuple ('', other) - str '' + bytes '' callable lambda: '' % sorted(b) object other ======== ================================= @@ -27,13 +29,16 @@ if r is None: return '' elif isinstance(r, tuple): - return r[0] % r[1:] - elif isinstance(r, str): + return r[0] % util.rapply(pycompat.maybebytestr, r[1:]) + elif isinstance(r, bytes): return r elif callable(r): return r() else: - return repr(r) + return pycompat.byterepr(r) + +def _typename(o): + return pycompat.sysbytes(type(o).__name__).lstrip('_') class abstractsmartset(object): @@ -306,7 +311,7 @@ self._istopo = False def __len__(self): - if '_list' in self.__dict__: + if r'_list' in self.__dict__: return len(self._list) else: return len(self._set) @@ -384,6 +389,7 @@ s._ascending = self._ascending return s + @encoding.strmethod def __repr__(self): d = {None: '', False: '-', True: '+'}[self._ascending] s = _formatsetrepr(self._datarepr) @@ -394,8 +400,8 @@ # We 
fallback to the sorted version for a stable output. if self._ascending is not None: l = self._asclist - s = repr(l) - return '<%s%s %s>' % (type(self).__name__, d, s) + s = pycompat.byterepr(l) + return '<%s%s %s>' % (_typename(self), d, s) class filteredset(abstractsmartset): """Duck type for baseset class which iterates lazily over the revisions in @@ -505,12 +511,13 @@ pass return x + @encoding.strmethod def __repr__(self): - xs = [repr(self._subset)] + xs = [pycompat.byterepr(self._subset)] s = _formatsetrepr(self._condrepr) if s: xs.append(s) - return '<%s %s>' % (type(self).__name__, ', '.join(xs)) + return '<%s %s>' % (_typename(self), ', '.join(xs)) def _iterordered(ascending, iter1, iter2): """produce an ordered iteration from two iterators with the same order @@ -755,9 +762,10 @@ self.reverse() return val + @encoding.strmethod def __repr__(self): d = {None: '', False: '-', True: '+'}[self._ascending] - return '<%s%s %r, %r>' % (type(self).__name__, d, self._r1, self._r2) + return '<%s%s %r, %r>' % (_typename(self), d, self._r1, self._r2) class generatorset(abstractsmartset): """Wrap a generator for lazy iteration @@ -918,9 +926,10 @@ return self.last() return next(it(), None) + @encoding.strmethod def __repr__(self): d = {False: '-', True: '+'}[self._ascending] - return '<%s%s>' % (type(self).__name__.lstrip('_'), d) + return '<%s%s>' % (_typename(self), d) class _generatorsetasc(generatorset): """Special case of generatorset optimized for ascending generators.""" @@ -1087,10 +1096,10 @@ y = max(self._end - start, self._start) return _spanset(x, y, self._ascending, self._hiddenrevs) + @encoding.strmethod def __repr__(self): d = {False: '-', True: '+'}[self._ascending] - return '<%s%s %d:%d>' % (type(self).__name__.lstrip('_'), d, - self._start, self._end) + return '<%s%s %d:%d>' % (_typename(self), d, self._start, self._end) class fullreposet(_spanset): """a set containing all revisions in the repo @@ -1123,7 +1132,7 @@ def prettyformat(revs): lines = [] 
- rs = repr(revs) + rs = pycompat.byterepr(revs) p = 0 while p < len(rs): q = rs.find('<', p + 1) diff -r fb92df8b634c -r ed5448edcbfa mercurial/sparse.py --- a/mercurial/sparse.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/sparse.py Wed Apr 18 15:32:08 2018 -0400 @@ -579,7 +579,7 @@ # part of the active rules. changed = False for p in paths: - with util.posixfile(util.expandpath(p)) as fh: + with util.posixfile(util.expandpath(p), mode='rb') as fh: raw = fh.read() iincludes, iexcludes, iprofiles = parseconfig(repo.ui, raw) diff -r fb92df8b634c -r ed5448edcbfa mercurial/sshpeer.py --- a/mercurial/sshpeer.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/sshpeer.py Wed Apr 18 15:32:08 2018 -0400 @@ -8,13 +8,20 @@ from __future__ import absolute_import import re +import uuid from .i18n import _ from . import ( error, pycompat, util, - wireproto, + wireprotoserver, + wireprototypes, + wireprotov1peer, + wireprotov1server, +) +from .utils import ( + procutil, ) def _serverquote(s): @@ -29,10 +36,11 @@ """display all data currently available on pipe as remote output. This is non blocking.""" - s = util.readpipe(pipe) - if s: - for l in s.splitlines(): - ui.status(_("remote: "), l, '\n') + if pipe: + s = procutil.readpipe(pipe) + if s: + for l in s.splitlines(): + ui.status(_("remote: "), l, '\n') class doublepipe(object): """Operate a side-channel pipe in addition of a main one @@ -63,8 +71,11 @@ (This will only wait for data if the setup is supported by `util.poll`) """ - if getattr(self._main, 'hasbuffer', False): # getattr for classic pipe - return (True, True) # main has data, assume side is worth poking at. + if (isinstance(self._main, util.bufferedinputpipe) and + self._main.hasbuffer): + # Main has data. Assume side is worth poking at. 
+ return True, True + fds = [self._main.fileno(), self._side.fileno()] try: act = util.poll(fds) @@ -114,49 +125,271 @@ def flush(self): return self._main.flush() -class sshpeer(wireproto.wirepeer): - def __init__(self, ui, path, create=False): - self._url = path - self._ui = ui - self._pipeo = self._pipei = self._pipee = None +def _cleanuppipes(ui, pipei, pipeo, pipee): + """Clean up pipes used by an SSH connection.""" + if pipeo: + pipeo.close() + if pipei: + pipei.close() + + if pipee: + # Try to read from the err descriptor until EOF. + try: + for l in pipee: + ui.status(_('remote: '), l) + except (IOError, ValueError): + pass + + pipee.close() + +def _makeconnection(ui, sshcmd, args, remotecmd, path, sshenv=None): + """Create an SSH connection to a server. + + Returns a tuple of (process, stdin, stdout, stderr) for the + spawned process. + """ + cmd = '%s %s %s' % ( + sshcmd, + args, + procutil.shellquote('%s -R %s serve --stdio' % ( + _serverquote(remotecmd), _serverquote(path)))) - u = util.url(path, parsequery=False, parsefragment=False) - if u.scheme != 'ssh' or not u.host or u.path is None: - self._abort(error.RepoError(_("couldn't parse location %s") % path)) + ui.debug('running %s\n' % cmd) + cmd = procutil.quotecommand(cmd) + + # no buffer allow the use of 'select' + # feel free to remove buffering and select usage when we ultimately + # move to threading. + stdin, stdout, stderr, proc = procutil.popen4(cmd, bufsize=0, env=sshenv) + + return proc, stdin, stdout, stderr + +def _clientcapabilities(): + """Return list of capabilities of this client. + + Returns a list of capabilities that are supported by this client. + """ + protoparams = {'partial-pull'} + comps = [e.wireprotosupport().name for e in + util.compengines.supportedwireengines(util.CLIENTROLE)] + protoparams.add('comp=%s' % ','.join(comps)) + return protoparams + +def _performhandshake(ui, stdin, stdout, stderr): + def badresponse(): + # Flush any output on stderr. 
+ _forwardoutput(ui, stderr) + + msg = _('no suitable response from remote hg') + hint = ui.config('ui', 'ssherrorhint') + raise error.RepoError(msg, hint=hint) - util.checksafessh(path) - - if u.passwd is not None: - self._abort(error.RepoError(_("password in URL not supported"))) + # The handshake consists of sending wire protocol commands in reverse + # order of protocol implementation and then sniffing for a response + # to one of them. + # + # Those commands (from oldest to newest) are: + # + # ``between`` + # Asks for the set of revisions between a pair of revisions. Command + # present in all Mercurial server implementations. + # + # ``hello`` + # Instructs the server to advertise its capabilities. Introduced in + # Mercurial 0.9.1. + # + # ``upgrade`` + # Requests upgrade from default transport protocol version 1 to + # a newer version. Introduced in Mercurial 4.6 as an experimental + # feature. + # + # The ``between`` command is issued with a request for the null + # range. If the remote is a Mercurial server, this request will + # generate a specific response: ``1\n\n``. This represents the + # wire protocol encoded value for ``\n``. We look for ``1\n\n`` + # in the output stream and know this is the response to ``between`` + # and we're at the end of our handshake reply. + # + # The response to the ``hello`` command will be a line with the + # length of the value returned by that command followed by that + # value. If the server doesn't support ``hello`` (which should be + # rare), that line will be ``0\n``. Otherwise, the value will contain + # RFC 822 like lines. Of these, the ``capabilities:`` line contains + # the capabilities of the server. + # + # The ``upgrade`` command isn't really a command in the traditional + # sense of version 1 of the transport because it isn't using the + # proper mechanism for formatting insteads: instead, it just encodes + # arguments on the line, delimited by spaces. + # + # The ``upgrade`` line looks like ``upgrade ``. 
+ # If the server doesn't support protocol upgrades, it will reply to + # this line with ``0\n``. Otherwise, it emits an + # ``upgraded `` line to both stdout and stderr. + # Content immediately following this line describes additional + # protocol and server state. + # + # In addition to the responses to our command requests, the server + # may emit "banner" output on stdout. SSH servers are allowed to + # print messages to stdout on login. Issuing commands on connection + # allows us to flush this banner output from the server by scanning + # for output to our well-known ``between`` command. Of course, if + # the banner contains ``1\n\n``, this will throw off our detection. - self._user = u.user - self._host = u.host - self._port = u.port - self._path = u.path or '.' + requestlog = ui.configbool('devel', 'debug.peer-request') + + # Generate a random token to help identify responses to version 2 + # upgrade request. + token = pycompat.sysbytes(str(uuid.uuid4())) + upgradecaps = [ + ('proto', wireprotoserver.SSHV2), + ] + upgradecaps = util.urlreq.urlencode(upgradecaps) + + try: + pairsarg = '%s-%s' % ('0' * 40, '0' * 40) + handshake = [ + 'hello\n', + 'between\n', + 'pairs %d\n' % len(pairsarg), + pairsarg, + ] - sshcmd = self.ui.config("ui", "ssh") - remotecmd = self.ui.config("ui", "remotecmd") - sshaddenv = dict(self.ui.configitems("sshenv")) - sshenv = util.shellenviron(sshaddenv) + # Request upgrade to version 2 if configured. 
+ if ui.configbool('experimental', 'sshpeer.advertise-v2'): + ui.debug('sending upgrade request: %s %s\n' % (token, upgradecaps)) + handshake.insert(0, 'upgrade %s %s\n' % (token, upgradecaps)) + + if requestlog: + ui.debug('devel-peer-request: hello\n') + ui.debug('sending hello command\n') + if requestlog: + ui.debug('devel-peer-request: between\n') + ui.debug('devel-peer-request: pairs: %d bytes\n' % len(pairsarg)) + ui.debug('sending between command\n') + + stdin.write(''.join(handshake)) + stdin.flush() + except IOError: + badresponse() + + # Assume version 1 of wire protocol by default. + protoname = wireprototypes.SSHV1 + reupgraded = re.compile(b'^upgraded %s (.*)$' % re.escape(token)) - args = util.sshargs(sshcmd, self._host, self._user, self._port) + lines = ['', 'dummy'] + max_noise = 500 + while lines[-1] and max_noise: + try: + l = stdout.readline() + _forwardoutput(ui, stderr) + + # Look for reply to protocol upgrade request. It has a token + # in it, so there should be no false positives. + m = reupgraded.match(l) + if m: + protoname = m.group(1) + ui.debug('protocol upgraded to %s\n' % protoname) + # If an upgrade was handled, the ``hello`` and ``between`` + # requests are ignored. The next output belongs to the + # protocol, so stop scanning lines. + break + + # Otherwise it could be a banner, ``0\n`` response if server + # doesn't support upgrade. + + if lines[-1] == '1\n' and l == '\n': + break + if l: + ui.debug('remote: ', l) + lines.append(l) + max_noise -= 1 + except IOError: + badresponse() + else: + badresponse() + + caps = set() - if create: - cmd = '%s %s %s' % (sshcmd, args, - util.shellquote("%s init %s" % - (_serverquote(remotecmd), _serverquote(self._path)))) - ui.debug('running %s\n' % cmd) - res = ui.system(cmd, blockedtag='sshpeer', environ=sshenv) - if res != 0: - self._abort(error.RepoError(_("could not create remote repo"))) + # For version 1, we should see a ``capabilities`` line in response to the + # ``hello`` command. 
+ if protoname == wireprototypes.SSHV1: + for l in reversed(lines): + # Look for response to ``hello`` command. Scan from the back so + # we don't misinterpret banner output as the command reply. + if l.startswith('capabilities:'): + caps.update(l[:-1].split(':')[1].split()) + break + elif protoname == wireprotoserver.SSHV2: + # We see a line with number of bytes to follow and then a value + # looking like ``capabilities: *``. + line = stdout.readline() + try: + valuelen = int(line) + except ValueError: + badresponse() + + capsline = stdout.read(valuelen) + if not capsline.startswith('capabilities: '): + badresponse() + + ui.debug('remote: %s\n' % capsline) + + caps.update(capsline.split(':')[1].split()) + # Trailing newline. + stdout.read(1) + + # Error if we couldn't find capabilities, this means: + # + # 1. Remote isn't a Mercurial server + # 2. Remote is a <0.9.1 Mercurial server + # 3. Remote is a future Mercurial server that dropped ``hello`` + # and other attempted handshake mechanisms. + if not caps: + badresponse() + + # Flush any output on stderr before proceeding. + _forwardoutput(ui, stderr) - self._validaterepo(sshcmd, args, remotecmd, sshenv) + return protoname, caps + +class sshv1peer(wireprotov1peer.wirepeer): + def __init__(self, ui, url, proc, stdin, stdout, stderr, caps, + autoreadstderr=True): + """Create a peer from an existing SSH connection. - # Begin of _basepeer interface. + ``proc`` is a handle on the underlying SSH process. + ``stdin``, ``stdout``, and ``stderr`` are handles on the stdio + pipes for that process. + ``caps`` is a set of capabilities supported by the remote. + ``autoreadstderr`` denotes whether to automatically read from + stderr and to forward its output. + """ + self._url = url + self.ui = ui + # self._subprocess is unused. Keeping a handle on the process + # holds a reference and prevents it from being garbage collected. 
+ self._subprocess = proc - @util.propertycache - def ui(self): - return self._ui + # And we hook up our "doublepipe" wrapper to allow querying + # stderr any time we perform I/O. + if autoreadstderr: + stdout = doublepipe(ui, util.bufferedinputpipe(stdout), stderr) + stdin = doublepipe(ui, stdin, stderr) + + self._pipeo = stdin + self._pipei = stdout + self._pipee = stderr + self._caps = caps + self._autoreadstderr = autoreadstderr + + # Commands that have a "framed" response where the first line of the + # response contains the length of that response. + _FRAMED_COMMANDS = { + 'batch', + } + + # Begin of ipeerconnection interface. def url(self): return self._url @@ -173,72 +406,14 @@ def close(self): pass - # End of _basepeer interface. + # End of ipeerconnection interface. - # Begin of _basewirecommands interface. + # Begin of ipeercommands interface. def capabilities(self): return self._caps - # End of _basewirecommands interface. - - def _validaterepo(self, sshcmd, args, remotecmd, sshenv=None): - # cleanup up previous run - self._cleanup() - - cmd = '%s %s %s' % (sshcmd, args, - util.shellquote("%s -R %s serve --stdio" % - (_serverquote(remotecmd), _serverquote(self._path)))) - self.ui.debug('running %s\n' % cmd) - cmd = util.quotecommand(cmd) - - # while self._subprocess isn't used, having it allows the subprocess to - # to clean up correctly later - # - # no buffer allow the use of 'select' - # feel free to remove buffering and select usage when we ultimately - # move to threading. 
- sub = util.popen4(cmd, bufsize=0, env=sshenv) - self._pipeo, self._pipei, self._pipee, self._subprocess = sub - - self._pipei = util.bufferedinputpipe(self._pipei) - self._pipei = doublepipe(self.ui, self._pipei, self._pipee) - self._pipeo = doublepipe(self.ui, self._pipeo, self._pipee) - - def badresponse(): - msg = _("no suitable response from remote hg") - hint = self.ui.config("ui", "ssherrorhint") - self._abort(error.RepoError(msg, hint=hint)) - - try: - # skip any noise generated by remote shell - self._callstream("hello") - r = self._callstream("between", pairs=("%s-%s" % ("0"*40, "0"*40))) - except IOError: - badresponse() - - lines = ["", "dummy"] - max_noise = 500 - while lines[-1] and max_noise: - try: - l = r.readline() - self._readerr() - if lines[-1] == "1\n" and l == "\n": - break - if l: - self.ui.debug("remote: ", l) - lines.append(l) - max_noise -= 1 - except IOError: - badresponse() - else: - badresponse() - - self._caps = set() - for l in reversed(lines): - if l.startswith("capabilities:"): - self._caps.update(l[:-1].split(":")[1].split()) - break + # End of ipeercommands interface. def _readerr(self): _forwardoutput(self.ui, self._pipee) @@ -248,41 +423,11 @@ raise exception def _cleanup(self): - if self._pipeo is None: - return - self._pipeo.close() - self._pipei.close() - try: - # read the error descriptor until EOF - for l in self._pipee: - self.ui.status(_("remote: "), l) - except (IOError, ValueError): - pass - self._pipee.close() + _cleanuppipes(self.ui, self._pipei, self._pipeo, self._pipee) __del__ = _cleanup - def _submitbatch(self, req): - rsp = self._callstream("batch", cmds=wireproto.encodebatchcmds(req)) - available = self._getamount() - # TODO this response parsing is probably suboptimal for large - # batches with large responses. 
- toread = min(available, 1024) - work = rsp.read(toread) - available -= toread - chunk = work - while chunk: - while ';' in work: - one, work = work.split(';', 1) - yield wireproto.unescapearg(one) - toread = min(available, 1024) - chunk = rsp.read(toread) - available -= toread - work += chunk - yield wireproto.unescapearg(work) - - def _callstream(self, cmd, **args): - args = pycompat.byteskwargs(args) + def _sendrequest(self, cmd, args, framed=False): if (self.ui.debugflag and self.ui.configbool('devel', 'debug.peer-request')): dbg = self.ui.debug @@ -296,7 +441,7 @@ dbg(line % ' %s-%s: %d' % (key, dk, len(dv))) self.ui.debug("sending %s command\n" % cmd) self._pipeo.write("%s\n" % cmd) - _func, names = wireproto.commands[cmd] + _func, names = wireprotov1server.commands[cmd] keys = names.split() wireargs = {} for k in keys: @@ -316,58 +461,176 @@ self._pipeo.write(v) self._pipeo.flush() + # We know exactly how many bytes are in the response. So return a proxy + # around the raw output stream that allows reading exactly this many + # bytes. Callers then can read() without fear of overrunning the + # response. + if framed: + amount = self._getamount() + return util.cappedreader(self._pipei, amount) + return self._pipei + def _callstream(self, cmd, **args): + args = pycompat.byteskwargs(args) + return self._sendrequest(cmd, args, framed=cmd in self._FRAMED_COMMANDS) + def _callcompressable(self, cmd, **args): - return self._callstream(cmd, **args) + args = pycompat.byteskwargs(args) + return self._sendrequest(cmd, args, framed=cmd in self._FRAMED_COMMANDS) def _call(self, cmd, **args): - self._callstream(cmd, **args) - return self._recv() + args = pycompat.byteskwargs(args) + return self._sendrequest(cmd, args, framed=True).read() def _callpush(self, cmd, fp, **args): + # The server responds with an empty frame if the client should + # continue submitting the payload. 
r = self._call(cmd, **args) if r: return '', r + + # The payload consists of frames with content followed by an empty + # frame. for d in iter(lambda: fp.read(4096), ''): - self._send(d) - self._send("", flush=True) - r = self._recv() + self._writeframed(d) + self._writeframed("", flush=True) + + # In case of success, there is an empty frame and a frame containing + # the integer result (as a string). + # In case of error, there is a non-empty frame containing the error. + r = self._readframed() if r: return '', r - return self._recv(), '' + return self._readframed(), '' def _calltwowaystream(self, cmd, fp, **args): + # The server responds with an empty frame if the client should + # continue submitting the payload. r = self._call(cmd, **args) if r: # XXX needs to be made better raise error.Abort(_('unexpected remote reply: %s') % r) + + # The payload consists of frames with content followed by an empty + # frame. for d in iter(lambda: fp.read(4096), ''): - self._send(d) - self._send("", flush=True) + self._writeframed(d) + self._writeframed("", flush=True) + return self._pipei def _getamount(self): l = self._pipei.readline() if l == '\n': - self._readerr() + if self._autoreadstderr: + self._readerr() msg = _('check previous remote output') self._abort(error.OutOfBandError(hint=msg)) - self._readerr() + if self._autoreadstderr: + self._readerr() try: return int(l) except ValueError: self._abort(error.ResponseError(_("unexpected response:"), l)) - def _recv(self): - return self._pipei.read(self._getamount()) + def _readframed(self): + size = self._getamount() + if not size: + return b'' - def _send(self, data, flush=False): + return self._pipei.read(size) + + def _writeframed(self, data, flush=False): self._pipeo.write("%d\n" % len(data)) if data: self._pipeo.write(data) if flush: self._pipeo.flush() - self._readerr() + if self._autoreadstderr: + self._readerr() + +class sshv2peer(sshv1peer): + """A peer that speakers version 2 of the transport protocol.""" + # 
Currently version 2 is identical to version 1 post handshake. + # And handshake is performed before the peer is instantiated. So + # we need no custom code. + +def makepeer(ui, path, proc, stdin, stdout, stderr, autoreadstderr=True): + """Make a peer instance from existing pipes. + + ``path`` and ``proc`` are stored on the eventual peer instance and may + not be used for anything meaningful. + + ``stdin``, ``stdout``, and ``stderr`` are the pipes connected to the + SSH server's stdio handles. + + This function is factored out to allow creating peers that don't + actually spawn a new process. It is useful for starting SSH protocol + servers and clients via non-standard means, which can be useful for + testing. + """ + try: + protoname, caps = _performhandshake(ui, stdin, stdout, stderr) + except Exception: + _cleanuppipes(ui, stdout, stdin, stderr) + raise + + if protoname == wireprototypes.SSHV1: + return sshv1peer(ui, path, proc, stdin, stdout, stderr, caps, + autoreadstderr=autoreadstderr) + elif protoname == wireprototypes.SSHV2: + return sshv2peer(ui, path, proc, stdin, stdout, stderr, caps, + autoreadstderr=autoreadstderr) + else: + _cleanuppipes(ui, stdout, stdin, stderr) + raise error.RepoError(_('unknown version of SSH protocol: %s') % + protoname) + +def instance(ui, path, create, intents=None): + """Create an SSH peer. -instance = sshpeer + The returned object conforms to the ``wireprotov1peer.wirepeer`` interface. + """ + u = util.url(path, parsequery=False, parsefragment=False) + if u.scheme != 'ssh' or not u.host or u.path is None: + raise error.RepoError(_("couldn't parse location %s") % path) + + util.checksafessh(path) + + if u.passwd is not None: + raise error.RepoError(_('password in URL not supported')) + + sshcmd = ui.config('ui', 'ssh') + remotecmd = ui.config('ui', 'remotecmd') + sshaddenv = dict(ui.configitems('sshenv')) + sshenv = procutil.shellenviron(sshaddenv) + remotepath = u.path or '.' 
+ + args = procutil.sshargs(sshcmd, u.host, u.user, u.port) + + if create: + cmd = '%s %s %s' % (sshcmd, args, + procutil.shellquote('%s init %s' % + (_serverquote(remotecmd), _serverquote(remotepath)))) + ui.debug('running %s\n' % cmd) + res = ui.system(cmd, blockedtag='sshpeer', environ=sshenv) + if res != 0: + raise error.RepoError(_('could not create remote repo')) + + proc, stdin, stdout, stderr = _makeconnection(ui, sshcmd, args, remotecmd, + remotepath, sshenv) + + peer = makepeer(ui, path, proc, stdin, stdout, stderr) + + # Finally, if supported by the server, notify it about our own + # capabilities. + if 'protocaps' in peer.capabilities(): + try: + peer._call("protocaps", + caps=' '.join(sorted(_clientcapabilities()))) + except IOError: + peer._cleanup() + raise error.RepoError(_('capability exchange failed')) + + return peer diff -r fb92df8b634c -r ed5448edcbfa mercurial/sshserver.py --- a/mercurial/sshserver.py Wed Apr 04 10:35:09 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ -# sshserver.py - ssh protocol server support for mercurial -# -# Copyright 2005-2007 Matt Mackall -# Copyright 2006 Vadim Gelfer -# -# This software may be used and distributed according to the terms of the -# GNU General Public License version 2 or any later version. - -from __future__ import absolute_import - -import sys - -from .i18n import _ -from . 
import ( - encoding, - error, - hook, - util, - wireproto, -) - -class sshserver(wireproto.abstractserverproto): - def __init__(self, ui, repo): - self.ui = ui - self.repo = repo - self.lock = None - self.fin = ui.fin - self.fout = ui.fout - self.name = 'ssh' - - hook.redirect(True) - ui.fout = repo.ui.fout = ui.ferr - - # Prevent insertion/deletion of CRs - util.setbinary(self.fin) - util.setbinary(self.fout) - - def getargs(self, args): - data = {} - keys = args.split() - for n in xrange(len(keys)): - argline = self.fin.readline()[:-1] - arg, l = argline.split() - if arg not in keys: - raise error.Abort(_("unexpected parameter %r") % arg) - if arg == '*': - star = {} - for k in xrange(int(l)): - argline = self.fin.readline()[:-1] - arg, l = argline.split() - val = self.fin.read(int(l)) - star[arg] = val - data['*'] = star - else: - val = self.fin.read(int(l)) - data[arg] = val - return [data[k] for k in keys] - - def getarg(self, name): - return self.getargs(name)[0] - - def getfile(self, fpout): - self.sendresponse('') - count = int(self.fin.readline()) - while count: - fpout.write(self.fin.read(count)) - count = int(self.fin.readline()) - - def redirect(self): - pass - - def sendresponse(self, v): - self.fout.write("%d\n" % len(v)) - self.fout.write(v) - self.fout.flush() - - def sendstream(self, source): - write = self.fout.write - for chunk in source.gen: - write(chunk) - self.fout.flush() - - def sendpushresponse(self, rsp): - self.sendresponse('') - self.sendresponse(str(rsp.res)) - - def sendpusherror(self, rsp): - self.sendresponse(rsp.res) - - def sendooberror(self, rsp): - self.ui.ferr.write('%s\n-\n' % rsp.message) - self.ui.ferr.flush() - self.fout.write('\n') - self.fout.flush() - - def serve_forever(self): - try: - while self.serve_one(): - pass - finally: - if self.lock is not None: - self.lock.release() - sys.exit(0) - - handlers = { - str: sendresponse, - wireproto.streamres: sendstream, - wireproto.streamres_legacy: sendstream, - 
wireproto.pushres: sendpushresponse, - wireproto.pusherr: sendpusherror, - wireproto.ooberror: sendooberror, - } - - def serve_one(self): - cmd = self.fin.readline()[:-1] - if cmd and cmd in wireproto.commands: - rsp = wireproto.dispatch(self.repo, self, cmd) - self.handlers[rsp.__class__](self, rsp) - elif cmd: - impl = getattr(self, 'do_' + cmd, None) - if impl: - r = impl() - if r is not None: - self.sendresponse(r) - else: - self.sendresponse("") - return cmd != '' - - def _client(self): - client = encoding.environ.get('SSH_CLIENT', '').split(' ', 1)[0] - return 'remote:ssh:' + client diff -r fb92df8b634c -r ed5448edcbfa mercurial/sslutil.py --- a/mercurial/sslutil.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/sslutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -21,6 +21,10 @@ pycompat, util, ) +from .utils import ( + procutil, + stringutil, +) # Python 2.7.9+ overhauled the built-in SSL/TLS features of Python. It added # support for TLS 1.1, TLS 1.2, SNI, system CA stores, etc. These features are @@ -113,6 +117,7 @@ Returns a dict of settings relevant to that hostname. """ + bhostname = pycompat.bytesurl(hostname) s = { # Whether we should attempt to load default/available CA certs # if an explicit ``cafile`` is not defined. 
@@ -162,14 +167,14 @@ ui.warn(_('warning: connecting to %s using legacy security ' 'technology (TLS 1.0); see ' 'https://mercurial-scm.org/wiki/SecureConnections for ' - 'more info\n') % hostname) + 'more info\n') % bhostname) defaultprotocol = 'tls1.0' key = 'minimumprotocol' protocol = ui.config('hostsecurity', key, defaultprotocol) validateprotocol(protocol, key) - key = '%s:minimumprotocol' % hostname + key = '%s:minimumprotocol' % bhostname protocol = ui.config('hostsecurity', key, protocol) validateprotocol(protocol, key) @@ -182,16 +187,16 @@ s['protocol'], s['ctxoptions'], s['protocolui'] = protocolsettings(protocol) ciphers = ui.config('hostsecurity', 'ciphers') - ciphers = ui.config('hostsecurity', '%s:ciphers' % hostname, ciphers) + ciphers = ui.config('hostsecurity', '%s:ciphers' % bhostname, ciphers) s['ciphers'] = ciphers # Look for fingerprints in [hostsecurity] section. Value is a list # of : strings. - fingerprints = ui.configlist('hostsecurity', '%s:fingerprints' % hostname) + fingerprints = ui.configlist('hostsecurity', '%s:fingerprints' % bhostname) for fingerprint in fingerprints: if not (fingerprint.startswith(('sha1:', 'sha256:', 'sha512:'))): raise error.Abort(_('invalid fingerprint for %s: %s') % ( - hostname, fingerprint), + bhostname, fingerprint), hint=_('must begin with "sha1:", "sha256:", ' 'or "sha512:"')) @@ -200,7 +205,7 @@ s['certfingerprints'].append((alg, fingerprint)) # Fingerprints from [hostfingerprints] are always SHA-1. - for fingerprint in ui.configlist('hostfingerprints', hostname): + for fingerprint in ui.configlist('hostfingerprints', bhostname): fingerprint = fingerprint.replace(':', '').lower() s['certfingerprints'].append(('sha1', fingerprint)) s['legacyfingerprint'] = True @@ -223,11 +228,11 @@ # If both fingerprints and a per-host ca file are specified, issue a warning # because users should not be surprised about what security is or isn't # being performed. 
- cafile = ui.config('hostsecurity', '%s:verifycertsfile' % hostname) + cafile = ui.config('hostsecurity', '%s:verifycertsfile' % bhostname) if s['certfingerprints'] and cafile: ui.warn(_('(hostsecurity.%s:verifycertsfile ignored when host ' 'fingerprints defined; using host fingerprints for ' - 'verification)\n') % hostname) + 'verification)\n') % bhostname) # Try to hook up CA certificate validation unless something above # makes it not necessary. @@ -237,8 +242,8 @@ cafile = util.expandpath(cafile) if not os.path.exists(cafile): raise error.Abort(_('path specified by %s does not exist: %s') % - ('hostsecurity.%s:verifycertsfile' % hostname, - cafile)) + ('hostsecurity.%s:verifycertsfile' % ( + bhostname,), cafile)) s['cafile'] = cafile else: # Find global certificates file in config. @@ -345,10 +350,11 @@ for f in (keyfile, certfile): if f and not os.path.exists(f): - raise error.Abort(_('certificate file (%s) does not exist; ' - 'cannot connect to %s') % (f, serverhostname), - hint=_('restore missing file or fix references ' - 'in Mercurial config')) + raise error.Abort( + _('certificate file (%s) does not exist; cannot connect to %s') + % (f, pycompat.bytesurl(serverhostname)), + hint=_('restore missing file or fix references ' + 'in Mercurial config')) settings = _hostsettings(ui, serverhostname) @@ -369,11 +375,13 @@ if settings['ciphers']: try: - sslcontext.set_ciphers(settings['ciphers']) + sslcontext.set_ciphers(pycompat.sysstr(settings['ciphers'])) except ssl.SSLError as e: - raise error.Abort(_('could not set ciphers: %s') % e.args[0], - hint=_('change cipher string (%s) in config') % - settings['ciphers']) + raise error.Abort( + _('could not set ciphers: %s') + % stringutil.forcebytestr(e.args[0]), + hint=_('change cipher string (%s) in config') % + settings['ciphers']) if certfile is not None: def password(): @@ -390,7 +398,7 @@ else: msg = e.args[1] raise error.Abort(_('error loading CA file %s: %s') % ( - settings['cafile'], msg), + 
settings['cafile'], stringutil.forcebytestr(msg)), hint=_('file is empty or malformed?')) caloaded = True elif settings['allowloaddefaultcerts']: @@ -583,8 +591,10 @@ pats = [] if not dn: return False + dn = pycompat.bytesurl(dn) + hostname = pycompat.bytesurl(hostname) - pieces = dn.split(r'.') + pieces = dn.split('.') leftmost = pieces[0] remainder = pieces[1:] wildcards = leftmost.count('*') @@ -611,13 +621,13 @@ pats.append(re.escape(leftmost)) else: # Otherwise, '*' matches any dotless string, e.g. www* - pats.append(re.escape(leftmost).replace(r'\*', '[^.]*')) + pats.append(re.escape(leftmost).replace(br'\*', '[^.]*')) # add the remaining fragments, ignore any wildcards for frag in remainder: pats.append(re.escape(frag)) - pat = re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE) + pat = re.compile(br'\A' + br'\.'.join(pats) + br'\Z', re.IGNORECASE) return pat.match(hostname) is not None def _verifycert(cert, hostname): @@ -637,17 +647,17 @@ if _dnsnamematch(value, hostname): return except wildcarderror as e: - return e.args[0] + return stringutil.forcebytestr(e.args[0]) dnsnames.append(value) if not dnsnames: # The subject is only checked when there is no DNS in subjectAltName. - for sub in cert.get('subject', []): + for sub in cert.get(r'subject', []): for key, value in sub: # According to RFC 2818 the most specific Common Name must # be used. - if key == 'commonName': + if key == r'commonName': # 'subject' entries are unicode. 
try: value = value.encode('ascii') @@ -658,7 +668,7 @@ if _dnsnamematch(value, hostname): return except wildcarderror as e: - return e.args[0] + return stringutil.forcebytestr(e.args[0]) dnsnames.append(value) @@ -677,7 +687,7 @@ for using system certificate store CAs in addition to the provided cacerts file """ - if (not pycompat.isdarwin or util.mainfrozen() or + if (not pycompat.isdarwin or procutil.mainfrozen() or not pycompat.sysexecutable): return False exe = os.path.realpath(pycompat.sysexecutable).lower() @@ -780,7 +790,8 @@ The passed socket must have been created with ``wrapsocket()``. """ - host = sock._hgstate['hostname'] + shost = sock._hgstate['hostname'] + host = pycompat.bytesurl(shost) ui = sock._hgstate['ui'] settings = sock._hgstate['settings'] @@ -856,7 +867,7 @@ 'hostsecurity.%s:fingerprints=%s to trust this server') % (host, nicefingerprint)) - msg = _verifycert(peercert2, host) + msg = _verifycert(peercert2, shost) if msg: raise error.Abort(_('%s certificate error: %s') % (host, msg), hint=_('set hostsecurity.%s:certfingerprints=%s ' diff -r fb92df8b634c -r ed5448edcbfa mercurial/stack.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/stack.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,29 @@ +# stack.py - Mercurial functions for stack definition +# +# Copyright Matt Mackall and other +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from . import ( + revsetlang, + scmutil, +) + +def getstack(repo, rev=None): + """return a sorted smartrev of the stack containing either rev if it is + not None or the current working directory parent. + + The stack will always contain all drafts changesets which are ancestors to + the revision and are not merges. + """ + if rev is None: + rev = '.' 
+ + revspec = 'reverse(only(%s) and not public() and not ::merge())' + revset = revsetlang.formatspec(revspec, rev) + revisions = scmutil.revrange(repo, [revset]) + revisions.sort() + return revisions diff -r fb92df8b634c -r ed5448edcbfa mercurial/statichttprepo.py --- a/mercurial/statichttprepo.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/statichttprepo.py Wed Apr 18 15:32:08 2018 -0400 @@ -13,7 +13,6 @@ from .i18n import _ from . import ( - byterange, changelog, error, localrepo, @@ -82,10 +81,36 @@ def close(self): pass +# _RangeError and _HTTPRangeHandler were originally in byterange.py, +# which was itself extracted from urlgrabber. See the last version of +# byterange.py from history if you need more information. +class _RangeError(IOError): + """Error raised when an unsatisfiable range is requested.""" + +class _HTTPRangeHandler(urlreq.basehandler): + """Handler that enables HTTP Range headers. + + This was extremely simple. The Range header is a HTTP feature to + begin with so all this class does is tell urllib2 that the + "206 Partial Content" response from the HTTP server is what we + expected. 
+ """ + + def http_error_206(self, req, fp, code, msg, hdrs): + # 206 Partial Content Response + r = urlreq.addinfourl(fp, hdrs, req.get_full_url()) + r.code = code + r.msg = msg + return r + + def http_error_416(self, req, fp, code, msg, hdrs): + # HTTP's Range Not Satisfiable error + raise _RangeError('Requested Range Not Satisfiable') + def build_opener(ui, authinfo): # urllib cannot handle URLs with embedded user or passwd urlopener = url.opener(ui, authinfo) - urlopener.add_handler(byterange.HTTPRangeHandler()) + urlopener.add_handler(_HTTPRangeHandler()) class statichttpvfs(vfsmod.abstractvfs): def __init__(self, base): @@ -190,7 +215,7 @@ def _writecaches(self): pass # statichttprepository are read only -def instance(ui, path, create): +def instance(ui, path, create, intents=None): if create: raise error.Abort(_('cannot create new static-http repository')) return statichttprepository(ui, path[7:]) diff -r fb92df8b634c -r ed5448edcbfa mercurial/store.py --- a/mercurial/store.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/store.py Wed Apr 18 15:32:08 2018 -0400 @@ -319,6 +319,9 @@ _data = ('data meta 00manifest.d 00manifest.i 00changelog.d 00changelog.i' ' phaseroots obsstore') +def isrevlog(f, kind, st): + return kind == stat.S_IFREG and f[-2:] in ('.i', '.d') + class basicstore(object): '''base class for local repository stores''' def __init__(self, path, vfstype): @@ -333,7 +336,7 @@ def join(self, f): return self.path + '/' + encodedir(f) - def _walk(self, relpath, recurse): + def _walk(self, relpath, recurse, filefilter=isrevlog): '''yields (unencoded, encoded, size)''' path = self.path if relpath: @@ -347,7 +350,7 @@ p = visit.pop() for f, kind, st in readdir(p, stat=True): fp = p + '/' + f - if kind == stat.S_IFREG and f[-2:] in ('.d', '.i'): + if filefilter(f, kind, st): n = util.pconvert(fp[striplen:]) l.append((decodedir(n), n, st.st_size)) elif kind == stat.S_IFDIR and recurse: diff -r fb92df8b634c -r ed5448edcbfa mercurial/streamclone.py --- 
a/mercurial/streamclone.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/streamclone.py Wed Apr 18 15:32:08 2018 -0400 @@ -126,11 +126,18 @@ # creation. rbranchmap = None if remote.capable('branchmap'): - rbranchmap = remote.branchmap() + with remote.commandexecutor() as e: + rbranchmap = e.callcommand('branchmap', {}).result() repo.ui.status(_('streaming all changes\n')) - fp = remote.stream_out() + with remote.commandexecutor() as e: + fp = e.callcommand('stream_out', {}).result() + + # TODO strictly speaking, this code should all be inside the context + # manager because the context manager is supposed to ensure all wire state + # is flushed when exiting. But the legacy peers don't do this, so it + # doesn't matter. l = fp.readline() try: resp = int(l) diff -r fb92df8b634c -r ed5448edcbfa mercurial/subrepo.py --- a/mercurial/subrepo.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/subrepo.py Wed Apr 18 15:32:08 2018 -0400 @@ -1,4 +1,4 @@ -# subrepo.py - sub-repository handling for Mercurial +# subrepo.py - sub-repository classes and factory # # Copyright 2009-2010 Matt Mackall # @@ -19,30 +19,35 @@ import tarfile import xml.dom.minidom - from .i18n import _ from . 
import ( cmdutil, - config, encoding, error, exchange, - filemerge, + logcmdutil, match as matchmod, node, pathutil, phases, pycompat, scmutil, + subrepoutil, util, vfs as vfsmod, ) +from .utils import ( + dateutil, + procutil, + stringutil, +) hg = None +reporelpath = subrepoutil.reporelpath +subrelpath = subrepoutil.subrelpath +_abssource = subrepoutil._abssource propertycache = util.propertycache -nullstate = ('', '', 'empty') - def _expandedabspath(path): ''' get a path or url and if it is a path expand it and return an absolute path @@ -73,291 +78,14 @@ raise ex except error.Abort as ex: subrepo = subrelpath(self) - errormsg = str(ex) + ' ' + _('(in subrepository "%s")') % subrepo + errormsg = (stringutil.forcebytestr(ex) + ' ' + + _('(in subrepository "%s")') % subrepo) # avoid handling this exception by raising a SubrepoAbort exception raise SubrepoAbort(errormsg, hint=ex.hint, subrepo=subrepo, cause=sys.exc_info()) return res return decoratedmethod -def state(ctx, ui): - """return a state dict, mapping subrepo paths configured in .hgsub - to tuple: (source from .hgsub, revision from .hgsubstate, kind - (key in types dict)) - """ - p = config.config() - repo = ctx.repo() - def read(f, sections=None, remap=None): - if f in ctx: - try: - data = ctx[f].data() - except IOError as err: - if err.errno != errno.ENOENT: - raise - # handle missing subrepo spec files as removed - ui.warn(_("warning: subrepo spec file \'%s\' not found\n") % - repo.pathto(f)) - return - p.parse(f, data, sections, remap, read) - else: - raise error.Abort(_("subrepo spec file \'%s\' not found") % - repo.pathto(f)) - if '.hgsub' in ctx: - read('.hgsub') - - for path, src in ui.configitems('subpaths'): - p.set('subpaths', path, src, ui.configsource('subpaths', path)) - - rev = {} - if '.hgsubstate' in ctx: - try: - for i, l in enumerate(ctx['.hgsubstate'].data().splitlines()): - l = l.lstrip() - if not l: - continue - try: - revision, path = l.split(" ", 1) - except ValueError: - raise 
error.Abort(_("invalid subrepository revision " - "specifier in \'%s\' line %d") - % (repo.pathto('.hgsubstate'), (i + 1))) - rev[path] = revision - except IOError as err: - if err.errno != errno.ENOENT: - raise - - def remap(src): - for pattern, repl in p.items('subpaths'): - # Turn r'C:\foo\bar' into r'C:\\foo\\bar' since re.sub - # does a string decode. - repl = util.escapestr(repl) - # However, we still want to allow back references to go - # through unharmed, so we turn r'\\1' into r'\1'. Again, - # extra escapes are needed because re.sub string decodes. - repl = re.sub(br'\\\\([0-9]+)', br'\\\1', repl) - try: - src = re.sub(pattern, repl, src, 1) - except re.error as e: - raise error.Abort(_("bad subrepository pattern in %s: %s") - % (p.source('subpaths', pattern), e)) - return src - - state = {} - for path, src in p[''].items(): - kind = 'hg' - if src.startswith('['): - if ']' not in src: - raise error.Abort(_('missing ] in subrepository source')) - kind, src = src.split(']', 1) - kind = kind[1:] - src = src.lstrip() # strip any extra whitespace after ']' - - if not util.url(src).isabs(): - parent = _abssource(repo, abort=False) - if parent: - parent = util.url(parent) - parent.path = posixpath.join(parent.path or '', src) - parent.path = posixpath.normpath(parent.path) - joined = str(parent) - # Remap the full joined path and use it if it changes, - # else remap the original source. 
- remapped = remap(joined) - if remapped == joined: - src = remap(src) - else: - src = remapped - - src = remap(src) - state[util.pconvert(path)] = (src.strip(), rev.get(path, ''), kind) - - return state - -def writestate(repo, state): - """rewrite .hgsubstate in (outer) repo with these subrepo states""" - lines = ['%s %s\n' % (state[s][1], s) for s in sorted(state) - if state[s][1] != nullstate[1]] - repo.wwrite('.hgsubstate', ''.join(lines), '') - -def submerge(repo, wctx, mctx, actx, overwrite, labels=None): - """delegated from merge.applyupdates: merging of .hgsubstate file - in working context, merging context and ancestor context""" - if mctx == actx: # backwards? - actx = wctx.p1() - s1 = wctx.substate - s2 = mctx.substate - sa = actx.substate - sm = {} - - repo.ui.debug("subrepo merge %s %s %s\n" % (wctx, mctx, actx)) - - def debug(s, msg, r=""): - if r: - r = "%s:%s:%s" % r - repo.ui.debug(" subrepo %s: %s %s\n" % (s, msg, r)) - - promptssrc = filemerge.partextras(labels) - for s, l in sorted(s1.iteritems()): - prompts = None - a = sa.get(s, nullstate) - ld = l # local state with possible dirty flag for compares - if wctx.sub(s).dirty(): - ld = (l[0], l[1] + "+") - if wctx == actx: # overwrite - a = ld - - prompts = promptssrc.copy() - prompts['s'] = s - if s in s2: - r = s2[s] - if ld == r or r == a: # no change or local is newer - sm[s] = l - continue - elif ld == a: # other side changed - debug(s, "other changed, get", r) - wctx.sub(s).get(r, overwrite) - sm[s] = r - elif ld[0] != r[0]: # sources differ - prompts['lo'] = l[0] - prompts['ro'] = r[0] - if repo.ui.promptchoice( - _(' subrepository sources for %(s)s differ\n' - 'use (l)ocal%(l)s source (%(lo)s)' - ' or (r)emote%(o)s source (%(ro)s)?' 
- '$$ &Local $$ &Remote') % prompts, 0): - debug(s, "prompt changed, get", r) - wctx.sub(s).get(r, overwrite) - sm[s] = r - elif ld[1] == a[1]: # local side is unchanged - debug(s, "other side changed, get", r) - wctx.sub(s).get(r, overwrite) - sm[s] = r - else: - debug(s, "both sides changed") - srepo = wctx.sub(s) - prompts['sl'] = srepo.shortid(l[1]) - prompts['sr'] = srepo.shortid(r[1]) - option = repo.ui.promptchoice( - _(' subrepository %(s)s diverged (local revision: %(sl)s, ' - 'remote revision: %(sr)s)\n' - '(M)erge, keep (l)ocal%(l)s or keep (r)emote%(o)s?' - '$$ &Merge $$ &Local $$ &Remote') - % prompts, 0) - if option == 0: - wctx.sub(s).merge(r) - sm[s] = l - debug(s, "merge with", r) - elif option == 1: - sm[s] = l - debug(s, "keep local subrepo revision", l) - else: - wctx.sub(s).get(r, overwrite) - sm[s] = r - debug(s, "get remote subrepo revision", r) - elif ld == a: # remote removed, local unchanged - debug(s, "remote removed, remove") - wctx.sub(s).remove() - elif a == nullstate: # not present in remote or ancestor - debug(s, "local added, keep") - sm[s] = l - continue - else: - if repo.ui.promptchoice( - _(' local%(l)s changed subrepository %(s)s' - ' which remote%(o)s removed\n' - 'use (c)hanged version or (d)elete?' - '$$ &Changed $$ &Delete') % prompts, 0): - debug(s, "prompt remove") - wctx.sub(s).remove() - - for s, r in sorted(s2.items()): - prompts = None - if s in s1: - continue - elif s not in sa: - debug(s, "remote added, get", r) - mctx.sub(s).get(r) - sm[s] = r - elif r != sa[s]: - prompts = promptssrc.copy() - prompts['s'] = s - if repo.ui.promptchoice( - _(' remote%(o)s changed subrepository %(s)s' - ' which local%(l)s removed\n' - 'use (c)hanged version or (d)elete?' 
- '$$ &Changed $$ &Delete') % prompts, 0) == 0: - debug(s, "prompt recreate", r) - mctx.sub(s).get(r) - sm[s] = r - - # record merged .hgsubstate - writestate(repo, sm) - return sm - -def precommit(ui, wctx, status, match, force=False): - """Calculate .hgsubstate changes that should be applied before committing - - Returns (subs, commitsubs, newstate) where - - subs: changed subrepos (including dirty ones) - - commitsubs: dirty subrepos which the caller needs to commit recursively - - newstate: new state dict which the caller must write to .hgsubstate - - This also updates the given status argument. - """ - subs = [] - commitsubs = set() - newstate = wctx.substate.copy() - - # only manage subrepos and .hgsubstate if .hgsub is present - if '.hgsub' in wctx: - # we'll decide whether to track this ourselves, thanks - for c in status.modified, status.added, status.removed: - if '.hgsubstate' in c: - c.remove('.hgsubstate') - - # compare current state to last committed state - # build new substate based on last committed state - oldstate = wctx.p1().substate - for s in sorted(newstate.keys()): - if not match(s): - # ignore working copy, use old state if present - if s in oldstate: - newstate[s] = oldstate[s] - continue - if not force: - raise error.Abort( - _("commit with new subrepo %s excluded") % s) - dirtyreason = wctx.sub(s).dirtyreason(True) - if dirtyreason: - if not ui.configbool('ui', 'commitsubrepos'): - raise error.Abort(dirtyreason, - hint=_("use --subrepos for recursive commit")) - subs.append(s) - commitsubs.add(s) - else: - bs = wctx.sub(s).basestate() - newstate[s] = (newstate[s][0], bs, newstate[s][2]) - if oldstate.get(s, (None, None, None))[1] != bs: - subs.append(s) - - # check for removed subrepos - for p in wctx.parents(): - r = [s for s in p.substate if s not in newstate] - subs += [s for s in r if match(s)] - if subs: - if (not match('.hgsub') and - '.hgsub' in (wctx.modified() + wctx.added())): - raise error.Abort(_("can't commit subrepos 
without .hgsub")) - status.modified.insert(0, '.hgsubstate') - - elif '.hgsub' in status.removed: - # clean up .hgsubstate when .hgsub is removed - if ('.hgsubstate' in wctx and - '.hgsubstate' not in (status.modified + status.added + - status.removed)): - status.removed.insert(0, '.hgsubstate') - - return subs, commitsubs, newstate - def _updateprompt(ui, sub, dirty, local, remote): if dirty: msg = (_(' subrepository sources for %s differ\n' @@ -372,64 +100,6 @@ % (subrelpath(sub), local, remote)) return ui.promptchoice(msg, 0) -def reporelpath(repo): - """return path to this (sub)repo as seen from outermost repo""" - parent = repo - while util.safehasattr(parent, '_subparent'): - parent = parent._subparent - return repo.root[len(pathutil.normasprefix(parent.root)):] - -def subrelpath(sub): - """return path to this subrepo as seen from outermost repo""" - return sub._relpath - -def _abssource(repo, push=False, abort=True): - """return pull/push path of repo - either based on parent repo .hgsub info - or on the top repo config. Abort or return None if no source found.""" - if util.safehasattr(repo, '_subparent'): - source = util.url(repo._subsource) - if source.isabs(): - return bytes(source) - source.path = posixpath.normpath(source.path) - parent = _abssource(repo._subparent, push, abort=False) - if parent: - parent = util.url(util.pconvert(parent)) - parent.path = posixpath.join(parent.path or '', source.path) - parent.path = posixpath.normpath(parent.path) - return bytes(parent) - else: # recursion reached top repo - path = None - if util.safehasattr(repo, '_subtoppath'): - path = repo._subtoppath - elif push and repo.ui.config('paths', 'default-push'): - path = repo.ui.config('paths', 'default-push') - elif repo.ui.config('paths', 'default'): - path = repo.ui.config('paths', 'default') - elif repo.shared(): - # chop off the .hg component to get the default path form. 
This has - # already run through vfsmod.vfs(..., realpath=True), so it doesn't - # have problems with 'C:' - return os.path.dirname(repo.sharedpath) - if path: - # issue5770: 'C:\' and 'C:' are not equivalent paths. The former is - # as expected: an absolute path to the root of the C: drive. The - # latter is a relative path, and works like so: - # - # C:\>cd C:\some\path - # C:\>D: - # D:\>python -c "import os; print os.path.abspath('C:')" - # C:\some\path - # - # D:\>python -c "import os; print os.path.abspath('C:relative')" - # C:\some\path\relative - if util.hasdriveletter(path): - if len(path) == 2 or path[2:3] not in br'\/': - path = os.path.abspath(path) - return path - - if abort: - raise error.Abort(_("default path for subrepository not found")) - def _sanitize(ui, vfs, ignore): for dirname, dirs, names in vfs.walk(): for i, d in enumerate(dirs): @@ -508,37 +178,6 @@ subrev = "0" * 40 return types[state[2]](pctx, path, (state[0], subrev), True) -def newcommitphase(ui, ctx): - commitphase = phases.newcommitphase(ui) - substate = getattr(ctx, "substate", None) - if not substate: - return commitphase - check = ui.config('phases', 'checksubrepos') - if check not in ('ignore', 'follow', 'abort'): - raise error.Abort(_('invalid phases.checksubrepos configuration: %s') - % (check)) - if check == 'ignore': - return commitphase - maxphase = phases.public - maxsub = None - for s in sorted(substate): - sub = ctx.sub(s) - subphase = sub.phase(substate[s][1]) - if maxphase < subphase: - maxphase = subphase - maxsub = s - if commitphase < maxphase: - if check == 'abort': - raise error.Abort(_("can't commit in %s phase" - " conflicting %s from subrepository %s") % - (phases.phasenames[commitphase], - phases.phasenames[maxphase], maxsub)) - ui.warn(_("warning: changes are committed in" - " %s phase from subrepository %s\n") % - (phases.phasenames[maxphase], maxsub)) - return maxphase - return commitphase - # subrepo classes need to implement the following abstract class: 
class abstractsubrepo(object): @@ -648,7 +287,7 @@ def add(self, ui, match, prefix, explicitonly, **opts): return [] - def addremove(self, matcher, prefix, opts, dry_run, similarity): + def addremove(self, matcher, prefix, opts): self.ui.warn("%s: %s" % (prefix, _("addremove is not supported"))) return 1 @@ -713,10 +352,11 @@ matched by the match function ''' - def forget(self, match, prefix): + def forget(self, match, prefix, dryrun, interactive): return ([], []) - def removefiles(self, matcher, prefix, after, force, subrepos, warnings): + def removefiles(self, matcher, prefix, after, force, subrepos, + dryrun, warnings): """remove the matched files from the subrepository and the filesystem, possibly by force and/or after the file has been removed from the filesystem. Return 0 on success, 1 on any warning. @@ -870,15 +510,14 @@ explicitonly, **opts) @annotatesubrepoerror - def addremove(self, m, prefix, opts, dry_run, similarity): + def addremove(self, m, prefix, opts): # In the same way as sub directories are processed, once in a subrepo, # always entry any of its subrepos. Don't corrupt the options that will # be used to process sibling subrepos however. 
opts = copy.copy(opts) opts['subrepos'] = True return scmutil.addremove(self._repo, m, - self.wvfs.reljoin(prefix, self._path), opts, - dry_run, similarity) + self.wvfs.reljoin(prefix, self._path), opts) @annotatesubrepoerror def cat(self, match, fm, fntemplate, prefix, **opts): @@ -907,10 +546,10 @@ # in hex format if node2 is not None: node2 = node.bin(node2) - cmdutil.diffordiffstat(ui, self._repo, diffopts, - node1, node2, match, - prefix=posixpath.join(prefix, self._path), - listsubrepos=True, **opts) + logcmdutil.diffordiffstat(ui, self._repo, diffopts, + node1, node2, match, + prefix=posixpath.join(prefix, self._path), + listsubrepos=True, **opts) except error.RepoLookupError as inst: self.ui.warn(_('warning: error "%s" in subrepository "%s"\n') % (inst, subrelpath(self))) @@ -918,9 +557,14 @@ @annotatesubrepoerror def archive(self, archiver, prefix, match=None, decode=True): self._get(self._state + ('hg',)) - total = abstractsubrepo.archive(self, archiver, prefix, match) + files = self.files() + if match: + files = [f for f in files if match(f)] rev = self._state[1] ctx = self._repo[rev] + scmutil.prefetchfiles(self._repo, [ctx.rev()], + scmutil.matchfiles(self._repo, files)) + total = abstractsubrepo.archive(self, archiver, prefix, match) for subpath in ctx.substate: s = subrepo(ctx, subpath, True) submatch = matchmod.subdirmatcher(subpath, match) @@ -959,7 +603,7 @@ @annotatesubrepoerror def phase(self, state): - return self._repo[state].phase() + return self._repo[state or '.'].phase() @annotatesubrepoerror def remove(self): @@ -1080,7 +724,7 @@ ssh = opts.get('ssh') # push subrepos depth-first for coherent ordering - c = self._repo[''] + c = self._repo['.'] subs = c.substate # only repos that are committed for s in sorted(subs): if c.sub(s).push(opts) == 0: @@ -1172,15 +816,17 @@ return ctx.walk(match) @annotatesubrepoerror - def forget(self, match, prefix): + def forget(self, match, prefix, dryrun, interactive): return cmdutil.forget(self.ui, 
self._repo, match, - self.wvfs.reljoin(prefix, self._path), True) + self.wvfs.reljoin(prefix, self._path), + True, dryrun=dryrun, interactive=interactive) @annotatesubrepoerror - def removefiles(self, matcher, prefix, after, force, subrepos, warnings): + def removefiles(self, matcher, prefix, after, force, subrepos, + dryrun, warnings): return cmdutil.remove(self.ui, self._repo, matcher, self.wvfs.reljoin(prefix, self._path), - after, force, subrepos) + after, force, subrepos, dryrun) @annotatesubrepoerror def revert(self, substate, *pats, **opts): @@ -1269,7 +915,7 @@ def __init__(self, ctx, path, state, allowcreate): super(svnsubrepo, self).__init__(ctx, path) self._state = state - self._exe = util.findexe('svn') + self._exe = procutil.findexe('svn') if not self._exe: raise error.Abort(_("'svn' executable not found for subrepo '%s'") % self._path) @@ -1299,7 +945,7 @@ env['LANG'] = lc_all del env['LC_ALL'] env['LC_MESSAGES'] = 'C' - p = subprocess.Popen(cmd, bufsize=-1, close_fds=util.closefds, + p = subprocess.Popen(cmd, bufsize=-1, close_fds=procutil.closefds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, env=env, **extrakw) stdout, stderr = p.communicate() @@ -1484,7 +1130,7 @@ doc = xml.dom.minidom.parseString(output) paths = [] for e in doc.getElementsByTagName('entry'): - kind = str(e.getAttribute('kind')) + kind = pycompat.bytestr(e.getAttribute('kind')) if kind != 'file': continue name = ''.join(c.data for c @@ -1617,7 +1263,7 @@ # the end of git diff arguments is used for paths commands.insert(1, '--color') p = subprocess.Popen([self._gitexecutable] + commands, bufsize=-1, - cwd=cwd, env=env, close_fds=util.closefds, + cwd=cwd, env=env, close_fds=procutil.closefds, stdout=subprocess.PIPE, stderr=errpipe) if stream: return p.stdout, None @@ -1849,7 +1495,7 @@ if date: # git's date parser silently ignores when seconds < 1e9 # convert to ISO8601 - env['GIT_AUTHOR_DATE'] = util.datestr(date, + env['GIT_AUTHOR_DATE'] = 
dateutil.datestr(date, '%Y-%m-%dT%H:%M:%S %1%2') self._gitcommand(cmd, env=env) # make sure commit works otherwise HEAD might not exist under certain @@ -1992,7 +1638,7 @@ # This should be much faster than manually traversing the trees # and objects with many subprocess calls. tarstream = self._gitcommand(['archive', revision], stream=True) - tar = tarfile.open(fileobj=tarstream, mode='r|') + tar = tarfile.open(fileobj=tarstream, mode=r'r|') relpath = subrelpath(self) self.ui.progress(_('archiving (%s)') % relpath, 0, unit=_('files')) for i, info in enumerate(tar): @@ -2025,8 +1671,7 @@ # TODO: add support for non-plain formatter (see cmdutil.cat()) for f in match.files(): output = self._gitcommand(["show", "%s:%s" % (rev, f)]) - fp = cmdutil.makefileobj(self._subparent, fntemplate, - self._ctx.node(), + fp = cmdutil.makefileobj(self._ctx, fntemplate, pathname=self.wvfs.reljoin(prefix, f)) fp.write(output) fp.close() diff -r fb92df8b634c -r ed5448edcbfa mercurial/subrepoutil.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/subrepoutil.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,396 @@ +# subrepoutil.py - sub-repository operations and substate handling +# +# Copyright 2009-2010 Matt Mackall +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import errno +import os +import posixpath +import re + +from .i18n import _ +from . 
import ( + config, + error, + filemerge, + pathutil, + phases, + util, +) +from .utils import ( + stringutil, +) + +nullstate = ('', '', 'empty') + +def state(ctx, ui): + """return a state dict, mapping subrepo paths configured in .hgsub + to tuple: (source from .hgsub, revision from .hgsubstate, kind + (key in types dict)) + """ + p = config.config() + repo = ctx.repo() + def read(f, sections=None, remap=None): + if f in ctx: + try: + data = ctx[f].data() + except IOError as err: + if err.errno != errno.ENOENT: + raise + # handle missing subrepo spec files as removed + ui.warn(_("warning: subrepo spec file \'%s\' not found\n") % + repo.pathto(f)) + return + p.parse(f, data, sections, remap, read) + else: + raise error.Abort(_("subrepo spec file \'%s\' not found") % + repo.pathto(f)) + if '.hgsub' in ctx: + read('.hgsub') + + for path, src in ui.configitems('subpaths'): + p.set('subpaths', path, src, ui.configsource('subpaths', path)) + + rev = {} + if '.hgsubstate' in ctx: + try: + for i, l in enumerate(ctx['.hgsubstate'].data().splitlines()): + l = l.lstrip() + if not l: + continue + try: + revision, path = l.split(" ", 1) + except ValueError: + raise error.Abort(_("invalid subrepository revision " + "specifier in \'%s\' line %d") + % (repo.pathto('.hgsubstate'), (i + 1))) + rev[path] = revision + except IOError as err: + if err.errno != errno.ENOENT: + raise + + def remap(src): + for pattern, repl in p.items('subpaths'): + # Turn r'C:\foo\bar' into r'C:\\foo\\bar' since re.sub + # does a string decode. + repl = stringutil.escapestr(repl) + # However, we still want to allow back references to go + # through unharmed, so we turn r'\\1' into r'\1'. Again, + # extra escapes are needed because re.sub string decodes. 
+ repl = re.sub(br'\\\\([0-9]+)', br'\\\1', repl) + try: + src = re.sub(pattern, repl, src, 1) + except re.error as e: + raise error.Abort(_("bad subrepository pattern in %s: %s") + % (p.source('subpaths', pattern), + stringutil.forcebytestr(e))) + return src + + state = {} + for path, src in p[''].items(): + kind = 'hg' + if src.startswith('['): + if ']' not in src: + raise error.Abort(_('missing ] in subrepository source')) + kind, src = src.split(']', 1) + kind = kind[1:] + src = src.lstrip() # strip any extra whitespace after ']' + + if not util.url(src).isabs(): + parent = _abssource(repo, abort=False) + if parent: + parent = util.url(parent) + parent.path = posixpath.join(parent.path or '', src) + parent.path = posixpath.normpath(parent.path) + joined = bytes(parent) + # Remap the full joined path and use it if it changes, + # else remap the original source. + remapped = remap(joined) + if remapped == joined: + src = remap(src) + else: + src = remapped + + src = remap(src) + state[util.pconvert(path)] = (src.strip(), rev.get(path, ''), kind) + + return state + +def writestate(repo, state): + """rewrite .hgsubstate in (outer) repo with these subrepo states""" + lines = ['%s %s\n' % (state[s][1], s) for s in sorted(state) + if state[s][1] != nullstate[1]] + repo.wwrite('.hgsubstate', ''.join(lines), '') + +def submerge(repo, wctx, mctx, actx, overwrite, labels=None): + """delegated from merge.applyupdates: merging of .hgsubstate file + in working context, merging context and ancestor context""" + if mctx == actx: # backwards? 
+ actx = wctx.p1() + s1 = wctx.substate + s2 = mctx.substate + sa = actx.substate + sm = {} + + repo.ui.debug("subrepo merge %s %s %s\n" % (wctx, mctx, actx)) + + def debug(s, msg, r=""): + if r: + r = "%s:%s:%s" % r + repo.ui.debug(" subrepo %s: %s %s\n" % (s, msg, r)) + + promptssrc = filemerge.partextras(labels) + for s, l in sorted(s1.iteritems()): + prompts = None + a = sa.get(s, nullstate) + ld = l # local state with possible dirty flag for compares + if wctx.sub(s).dirty(): + ld = (l[0], l[1] + "+") + if wctx == actx: # overwrite + a = ld + + prompts = promptssrc.copy() + prompts['s'] = s + if s in s2: + r = s2[s] + if ld == r or r == a: # no change or local is newer + sm[s] = l + continue + elif ld == a: # other side changed + debug(s, "other changed, get", r) + wctx.sub(s).get(r, overwrite) + sm[s] = r + elif ld[0] != r[0]: # sources differ + prompts['lo'] = l[0] + prompts['ro'] = r[0] + if repo.ui.promptchoice( + _(' subrepository sources for %(s)s differ\n' + 'use (l)ocal%(l)s source (%(lo)s)' + ' or (r)emote%(o)s source (%(ro)s)?' + '$$ &Local $$ &Remote') % prompts, 0): + debug(s, "prompt changed, get", r) + wctx.sub(s).get(r, overwrite) + sm[s] = r + elif ld[1] == a[1]: # local side is unchanged + debug(s, "other side changed, get", r) + wctx.sub(s).get(r, overwrite) + sm[s] = r + else: + debug(s, "both sides changed") + srepo = wctx.sub(s) + prompts['sl'] = srepo.shortid(l[1]) + prompts['sr'] = srepo.shortid(r[1]) + option = repo.ui.promptchoice( + _(' subrepository %(s)s diverged (local revision: %(sl)s, ' + 'remote revision: %(sr)s)\n' + '(M)erge, keep (l)ocal%(l)s or keep (r)emote%(o)s?' 
+ '$$ &Merge $$ &Local $$ &Remote') + % prompts, 0) + if option == 0: + wctx.sub(s).merge(r) + sm[s] = l + debug(s, "merge with", r) + elif option == 1: + sm[s] = l + debug(s, "keep local subrepo revision", l) + else: + wctx.sub(s).get(r, overwrite) + sm[s] = r + debug(s, "get remote subrepo revision", r) + elif ld == a: # remote removed, local unchanged + debug(s, "remote removed, remove") + wctx.sub(s).remove() + elif a == nullstate: # not present in remote or ancestor + debug(s, "local added, keep") + sm[s] = l + continue + else: + if repo.ui.promptchoice( + _(' local%(l)s changed subrepository %(s)s' + ' which remote%(o)s removed\n' + 'use (c)hanged version or (d)elete?' + '$$ &Changed $$ &Delete') % prompts, 0): + debug(s, "prompt remove") + wctx.sub(s).remove() + + for s, r in sorted(s2.items()): + prompts = None + if s in s1: + continue + elif s not in sa: + debug(s, "remote added, get", r) + mctx.sub(s).get(r) + sm[s] = r + elif r != sa[s]: + prompts = promptssrc.copy() + prompts['s'] = s + if repo.ui.promptchoice( + _(' remote%(o)s changed subrepository %(s)s' + ' which local%(l)s removed\n' + 'use (c)hanged version or (d)elete?' + '$$ &Changed $$ &Delete') % prompts, 0) == 0: + debug(s, "prompt recreate", r) + mctx.sub(s).get(r) + sm[s] = r + + # record merged .hgsubstate + writestate(repo, sm) + return sm + +def precommit(ui, wctx, status, match, force=False): + """Calculate .hgsubstate changes that should be applied before committing + + Returns (subs, commitsubs, newstate) where + - subs: changed subrepos (including dirty ones) + - commitsubs: dirty subrepos which the caller needs to commit recursively + - newstate: new state dict which the caller must write to .hgsubstate + + This also updates the given status argument. 
+ """ + subs = [] + commitsubs = set() + newstate = wctx.substate.copy() + + # only manage subrepos and .hgsubstate if .hgsub is present + if '.hgsub' in wctx: + # we'll decide whether to track this ourselves, thanks + for c in status.modified, status.added, status.removed: + if '.hgsubstate' in c: + c.remove('.hgsubstate') + + # compare current state to last committed state + # build new substate based on last committed state + oldstate = wctx.p1().substate + for s in sorted(newstate.keys()): + if not match(s): + # ignore working copy, use old state if present + if s in oldstate: + newstate[s] = oldstate[s] + continue + if not force: + raise error.Abort( + _("commit with new subrepo %s excluded") % s) + dirtyreason = wctx.sub(s).dirtyreason(True) + if dirtyreason: + if not ui.configbool('ui', 'commitsubrepos'): + raise error.Abort(dirtyreason, + hint=_("use --subrepos for recursive commit")) + subs.append(s) + commitsubs.add(s) + else: + bs = wctx.sub(s).basestate() + newstate[s] = (newstate[s][0], bs, newstate[s][2]) + if oldstate.get(s, (None, None, None))[1] != bs: + subs.append(s) + + # check for removed subrepos + for p in wctx.parents(): + r = [s for s in p.substate if s not in newstate] + subs += [s for s in r if match(s)] + if subs: + if (not match('.hgsub') and + '.hgsub' in (wctx.modified() + wctx.added())): + raise error.Abort(_("can't commit subrepos without .hgsub")) + status.modified.insert(0, '.hgsubstate') + + elif '.hgsub' in status.removed: + # clean up .hgsubstate when .hgsub is removed + if ('.hgsubstate' in wctx and + '.hgsubstate' not in (status.modified + status.added + + status.removed)): + status.removed.insert(0, '.hgsubstate') + + return subs, commitsubs, newstate + +def reporelpath(repo): + """return path to this (sub)repo as seen from outermost repo""" + parent = repo + while util.safehasattr(parent, '_subparent'): + parent = parent._subparent + return repo.root[len(pathutil.normasprefix(parent.root)):] + +def subrelpath(sub): + 
"""return path to this subrepo as seen from outermost repo""" + return sub._relpath + +def _abssource(repo, push=False, abort=True): + """return pull/push path of repo - either based on parent repo .hgsub info + or on the top repo config. Abort or return None if no source found.""" + if util.safehasattr(repo, '_subparent'): + source = util.url(repo._subsource) + if source.isabs(): + return bytes(source) + source.path = posixpath.normpath(source.path) + parent = _abssource(repo._subparent, push, abort=False) + if parent: + parent = util.url(util.pconvert(parent)) + parent.path = posixpath.join(parent.path or '', source.path) + parent.path = posixpath.normpath(parent.path) + return bytes(parent) + else: # recursion reached top repo + path = None + if util.safehasattr(repo, '_subtoppath'): + path = repo._subtoppath + elif push and repo.ui.config('paths', 'default-push'): + path = repo.ui.config('paths', 'default-push') + elif repo.ui.config('paths', 'default'): + path = repo.ui.config('paths', 'default') + elif repo.shared(): + # chop off the .hg component to get the default path form. This has + # already run through vfsmod.vfs(..., realpath=True), so it doesn't + # have problems with 'C:' + return os.path.dirname(repo.sharedpath) + if path: + # issue5770: 'C:\' and 'C:' are not equivalent paths. The former is + # as expected: an absolute path to the root of the C: drive. 
The + # latter is a relative path, and works like so: + # + # C:\>cd C:\some\path + # C:\>D: + # D:\>python -c "import os; print os.path.abspath('C:')" + # C:\some\path + # + # D:\>python -c "import os; print os.path.abspath('C:relative')" + # C:\some\path\relative + if util.hasdriveletter(path): + if len(path) == 2 or path[2:3] not in br'\/': + path = os.path.abspath(path) + return path + + if abort: + raise error.Abort(_("default path for subrepository not found")) + +def newcommitphase(ui, ctx): + commitphase = phases.newcommitphase(ui) + substate = getattr(ctx, "substate", None) + if not substate: + return commitphase + check = ui.config('phases', 'checksubrepos') + if check not in ('ignore', 'follow', 'abort'): + raise error.Abort(_('invalid phases.checksubrepos configuration: %s') + % (check)) + if check == 'ignore': + return commitphase + maxphase = phases.public + maxsub = None + for s in sorted(substate): + sub = ctx.sub(s) + subphase = sub.phase(substate[s][1]) + if maxphase < subphase: + maxphase = subphase + maxsub = s + if commitphase < maxphase: + if check == 'abort': + raise error.Abort(_("can't commit in %s phase" + " conflicting %s from subrepository %s") % + (phases.phasenames[commitphase], + phases.phasenames[maxphase], maxsub)) + ui.warn(_("warning: changes are committed in" + " %s phase from subrepository %s\n") % + (phases.phasenames[maxphase], maxsub)) + return maxphase + return commitphase diff -r fb92df8b634c -r ed5448edcbfa mercurial/tagmerge.py --- a/mercurial/tagmerge.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/tagmerge.py Wed Apr 18 15:32:08 2018 -0400 @@ -73,8 +73,6 @@ from __future__ import absolute_import -import operator - from .i18n import _ from .node import ( hex, @@ -146,7 +144,7 @@ possible to the first parent's .hgtags file. 
''' # group the node-tag pairs that must be written next to each other - for tname, taglist in mergedtags.items(): + for tname, taglist in list(mergedtags.items()): mergedtags[tname] = grouptagnodesbyline(taglist) # convert the grouped merged tags dict into a format that resembles the @@ -164,7 +162,7 @@ # before writing them # the position is calculated to ensure that the diff of the merged .hgtags # file to the first parent's .hgtags file is as small as possible - finaltags.sort(key=operator.itemgetter(0)) + finaltags.sort(key=lambda x: -1 if x[0] is None else x[0]) # finally we can join the sorted groups to get the final contents of the # merged .hgtags file, and then write it to disk @@ -269,4 +267,3 @@ writemergedtags(fcd, mergedtags) ui.note(_('.hgtags merged successfully\n')) return False, 0 - diff -r fb92df8b634c -r ed5448edcbfa mercurial/tags.py --- a/mercurial/tags.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/tags.py Wed Apr 18 15:32:08 2018 -0400 @@ -28,6 +28,9 @@ scmutil, util, ) +from .utils import ( + stringutil, +) # Tags computation can be expensive and caches exist to make it fast in # the common case. @@ -244,7 +247,7 @@ # remove tags pointing to invalid nodes cl = repo.changelog - for t in filetags.keys(): + for t in list(filetags): try: cl.rev(filetags[t][0]) except (LookupError, ValueError): @@ -276,7 +279,7 @@ count = 0 def dbg(msg): - ui.debug("%s, line %s: %s\n" % (fn, count, msg)) + ui.debug("%s, line %d: %s\n" % (fn, count, msg)) for nline, line in enumerate(lines): count += 1 @@ -559,7 +562,7 @@ def writetags(fp, names, munge, prevtags): fp.seek(0, 2) - if prevtags and prevtags[-1] != '\n': + if prevtags and not prevtags.endswith('\n'): fp.write('\n') for name in names: if munge: @@ -739,7 +742,7 @@ entry = bytearray(prefix + fnode) self._raw[offset:offset + _fnodesrecsize] = entry # self._dirtyoffset could be None. 
- self._dirtyoffset = min(self._dirtyoffset, offset) or 0 + self._dirtyoffset = min(self._dirtyoffset or 0, offset or 0) def write(self): """Perform all necessary writes to cache file. @@ -783,6 +786,6 @@ except (IOError, OSError) as inst: repo.ui.log('tagscache', "couldn't write cache/%s: %s\n" % ( - _fnodescachefile, inst)) + _fnodescachefile, stringutil.forcebytestr(inst))) finally: lock.release() diff -r fb92df8b634c -r ed5448edcbfa mercurial/templatefilters.py --- a/mercurial/templatefilters.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templatefilters.py Wed Apr 18 15:32:08 2018 -0400 @@ -11,17 +11,21 @@ import re import time +from .i18n import _ from . import ( encoding, error, - hbisect, node, pycompat, registrar, - templatekw, + templateutil, url, util, ) +from .utils import ( + dateutil, + stringutil, +) urlerr = util.urlerr urlreq = util.urlreq @@ -37,7 +41,7 @@ templatefilter = registrar.templatefilter(filters) -@templatefilter('addbreaks') +@templatefilter('addbreaks', intype=bytes) def addbreaks(text): """Any text. Add an XHTML "
" tag before the end of every line except the last. @@ -52,7 +56,7 @@ ("minute", 60, 'm'), ("second", 1, 's')] -@templatefilter('age') +@templatefilter('age', intype=templateutil.date) def age(date, abbrev=False): """Date. Returns a human-readable date/time difference between the given date/time and the current date/time. @@ -78,7 +82,7 @@ else: delta = max(1, int(now - then)) if delta > agescales[0][1] * 2: - return util.shortdate(date) + return dateutil.shortdate(date) for t, s, a in agescales: n = delta // s @@ -87,7 +91,7 @@ return '%s from now' % fmt(t, n, a) return '%s ago' % fmt(t, n, a) -@templatefilter('basename') +@templatefilter('basename', intype=bytes) def basename(path): """Any text. Treats the text as a path, and returns the last component of the path after splitting by the path separator. @@ -98,9 +102,19 @@ @templatefilter('count') def count(i): """List or text. Returns the length as an integer.""" - return len(i) + try: + return len(i) + except TypeError: + raise error.ParseError(_('not countable')) -@templatefilter('domain') +@templatefilter('dirname', intype=bytes) +def dirname(path): + """Any text. Treats the text as a path, and strips the last + component of the path after splitting by the path separator. + """ + return os.path.dirname(path) + +@templatefilter('domain', intype=bytes) def domain(author): """Any text. Finds the first string that looks like an email address, and extracts just the domain component. Example: ``User @@ -115,15 +129,15 @@ author = author[:f] return author -@templatefilter('email') +@templatefilter('email', intype=bytes) def email(text): """Any text. Extracts the first string that looks like an email address. Example: ``User `` becomes ``user@example.com``. """ - return util.email(text) + return stringutil.email(text) -@templatefilter('escape') +@templatefilter('escape', intype=bytes) def escape(text): """Any text. 
Replaces the special XML/XHTML characters "&", "<" and ">" with XML entities, and filters out NUL characters. @@ -138,38 +152,39 @@ global para_re, space_re if para_re is None: para_re = re.compile('(\n\n|\n\\s*[-*]\\s*)', re.M) - space_re = re.compile(r' +') + space_re = re.compile(br' +') def findparas(): start = 0 while True: m = para_re.search(text, start) if not m: - uctext = unicode(text[start:], encoding.encoding) + uctext = encoding.unifromlocal(text[start:]) w = len(uctext) while 0 < w and uctext[w - 1].isspace(): w -= 1 - yield (uctext[:w].encode(encoding.encoding), - uctext[w:].encode(encoding.encoding)) + yield (encoding.unitolocal(uctext[:w]), + encoding.unitolocal(uctext[w:])) break yield text[start:m.start(0)], m.group(1) start = m.end(1) - return "".join([util.wrap(space_re.sub(' ', util.wrap(para, width)), - width, initindent, hangindent) + rest + return "".join([stringutil.wrap(space_re.sub(' ', + stringutil.wrap(para, width)), + width, initindent, hangindent) + rest for para, rest in findparas()]) -@templatefilter('fill68') +@templatefilter('fill68', intype=bytes) def fill68(text): """Any text. Wraps the text to fit in 68 columns.""" return fill(text, 68) -@templatefilter('fill76') +@templatefilter('fill76', intype=bytes) def fill76(text): """Any text. Wraps the text to fit in 76 columns.""" return fill(text, 76) -@templatefilter('firstline') +@templatefilter('firstline', intype=bytes) def firstline(text): """Any text. Returns the first line of text.""" try: @@ -177,34 +192,34 @@ except IndexError: return '' -@templatefilter('hex') +@templatefilter('hex', intype=bytes) def hexfilter(text): """Any text. Convert a binary Mercurial node identifier into its long hexadecimal representation. """ return node.hex(text) -@templatefilter('hgdate') +@templatefilter('hgdate', intype=templateutil.date) def hgdate(text): """Date. Returns the date as a pair of numbers: "1157407993 25200" (Unix timestamp, timezone offset). 
""" return "%d %d" % text -@templatefilter('isodate') +@templatefilter('isodate', intype=templateutil.date) def isodate(text): """Date. Returns the date in ISO 8601 format: "2009-08-18 13:00 +0200". """ - return util.datestr(text, '%Y-%m-%d %H:%M %1%2') + return dateutil.datestr(text, '%Y-%m-%d %H:%M %1%2') -@templatefilter('isodatesec') +@templatefilter('isodatesec', intype=templateutil.date) def isodatesec(text): """Date. Returns the date in ISO 8601 format, including seconds: "2009-08-18 13:00:13 +0200". See also the rfc3339date filter. """ - return util.datestr(text, '%Y-%m-%d %H:%M:%S %1%2') + return dateutil.datestr(text, '%Y-%m-%d %H:%M:%S %1%2') def indent(text, prefix): '''indent each non-empty line of text after first with prefix.''' @@ -248,28 +263,27 @@ elif util.safehasattr(obj, '__iter__'): out = [json(i, paranoid) for i in obj] return '[' + ', '.join(out) + ']' - else: - raise TypeError('cannot encode type %s' % obj.__class__.__name__) + raise error.ProgrammingError('cannot encode %r' % obj) -@templatefilter('lower') +@templatefilter('lower', intype=bytes) def lower(text): """Any text. Converts the text to lowercase.""" return encoding.lower(text) -@templatefilter('nonempty') -def nonempty(str): +@templatefilter('nonempty', intype=bytes) +def nonempty(text): """Any text. Returns '(none)' if the string is empty.""" - return str or "(none)" + return text or "(none)" -@templatefilter('obfuscate') +@templatefilter('obfuscate', intype=bytes) def obfuscate(text): """Any text. Returns the input text rendered as a sequence of XML entities. 
""" - text = unicode(text, encoding.encoding, 'replace') + text = unicode(text, pycompat.sysstr(encoding.encoding), r'replace') return ''.join(['&#%d;' % ord(c) for c in text]) -@templatefilter('permissions') +@templatefilter('permissions', intype=bytes) def permissions(flags): if "l" in flags: return "lrwxrwxrwx" @@ -277,35 +291,14 @@ return "-rwxr-xr-x" return "-rw-r--r--" -@templatefilter('person') +@templatefilter('person', intype=bytes) def person(author): """Any text. Returns the name before an email address, interpreting it as per RFC 5322. + """ + return stringutil.person(author) - >>> person(b'foo@bar') - 'foo' - >>> person(b'Foo Bar ') - 'Foo Bar' - >>> person(b'"Foo Bar" ') - 'Foo Bar' - >>> person(b'"Foo \"buz\" Bar" ') - 'Foo "buz" Bar' - >>> # The following are invalid, but do exist in real-life - ... - >>> person(b'Foo "buz" Bar ') - 'Foo "buz" Bar' - >>> person(b'"Foo Bar ') - 'Foo Bar' - """ - if '@' not in author: - return author - f = author.find('<') - if f != -1: - return author[:f].strip(' "').replace('\\"', '"') - f = author.find('@') - return author[:f].replace('.', ' ') - -@templatefilter('revescape') +@templatefilter('revescape', intype=bytes) def revescape(text): """Any text. Escapes all "special" characters, except @. Forward slashes are escaped twice to prevent web servers from prematurely @@ -313,68 +306,65 @@ """ return urlreq.quote(text, safe='/@').replace('/', '%252F') -@templatefilter('rfc3339date') +@templatefilter('rfc3339date', intype=templateutil.date) def rfc3339date(text): """Date. Returns a date using the Internet date format specified in RFC 3339: "2009-08-18T13:00:13+02:00". """ - return util.datestr(text, "%Y-%m-%dT%H:%M:%S%1:%2") + return dateutil.datestr(text, "%Y-%m-%dT%H:%M:%S%1:%2") -@templatefilter('rfc822date') +@templatefilter('rfc822date', intype=templateutil.date) def rfc822date(text): """Date. Returns a date using the same format used in email headers: "Tue, 18 Aug 2009 13:00:13 +0200". 
""" - return util.datestr(text, "%a, %d %b %Y %H:%M:%S %1%2") + return dateutil.datestr(text, "%a, %d %b %Y %H:%M:%S %1%2") -@templatefilter('short') +@templatefilter('short', intype=bytes) def short(text): """Changeset hash. Returns the short form of a changeset hash, i.e. a 12 hexadecimal digit string. """ return text[:12] -@templatefilter('shortbisect') -def shortbisect(text): - """Any text. Treats `text` as a bisection status, and +@templatefilter('shortbisect', intype=bytes) +def shortbisect(label): + """Any text. Treats `label` as a bisection status, and returns a single-character representing the status (G: good, B: bad, S: skipped, U: untested, I: ignored). Returns single space if `text` is not a valid bisection status. """ - return hbisect.shortlabel(text) or ' ' + if label: + return label[0:1].upper() + return ' ' -@templatefilter('shortdate') +@templatefilter('shortdate', intype=templateutil.date) def shortdate(text): """Date. Returns a date like "2006-09-18".""" - return util.shortdate(text) + return dateutil.shortdate(text) -@templatefilter('slashpath') +@templatefilter('slashpath', intype=bytes) def slashpath(path): """Any text. Replaces the native path separator with slash.""" return util.pconvert(path) -@templatefilter('splitlines') +@templatefilter('splitlines', intype=bytes) def splitlines(text): """Any text. Split text into a list of lines.""" - return templatekw.hybridlist(text.splitlines(), name='line') + return templateutil.hybridlist(text.splitlines(), name='line') -@templatefilter('stringescape') +@templatefilter('stringescape', intype=bytes) def stringescape(text): - return util.escapestr(text) + return stringutil.escapestr(text) -@templatefilter('stringify') +@templatefilter('stringify', intype=bytes) def stringify(thing): """Any type. Turns the value into text by converting values into text and concatenating them. 
""" - thing = templatekw.unwraphybrid(thing) - if util.safehasattr(thing, '__iter__') and not isinstance(thing, bytes): - return "".join([stringify(t) for t in thing if t is not None]) - if thing is None: - return "" - return pycompat.bytestr(thing) + return thing # coerced by the intype -@templatefilter('stripdir') +@templatefilter('stripdir', intype=bytes) def stripdir(text): """Treat the text as path and strip a directory level, if possible. For example, "foo" and "foo/bar" becomes "foo". @@ -385,42 +375,42 @@ else: return dir -@templatefilter('tabindent') +@templatefilter('tabindent', intype=bytes) def tabindent(text): """Any text. Returns the text, with every non-empty line except the first starting with a tab character. """ return indent(text, '\t') -@templatefilter('upper') +@templatefilter('upper', intype=bytes) def upper(text): """Any text. Converts the text to uppercase.""" return encoding.upper(text) -@templatefilter('urlescape') +@templatefilter('urlescape', intype=bytes) def urlescape(text): """Any text. Escapes all "special" characters. For example, "foo bar" becomes "foo%20bar". """ return urlreq.quote(text) -@templatefilter('user') +@templatefilter('user', intype=bytes) def userfilter(text): """Any text. Returns a short representation of a user name or email address.""" - return util.shortuser(text) + return stringutil.shortuser(text) -@templatefilter('emailuser') +@templatefilter('emailuser', intype=bytes) def emailuser(text): """Any text. Returns the user portion of an email address.""" - return util.emailuser(text) + return stringutil.emailuser(text) -@templatefilter('utf8') +@templatefilter('utf8', intype=bytes) def utf8(text): """Any text. 
Converts from the local character encoding to UTF-8.""" return encoding.fromlocal(text) -@templatefilter('xmlescape') +@templatefilter('xmlescape', intype=bytes) def xmlescape(text): text = (text .replace('&', '&') diff -r fb92df8b634c -r ed5448edcbfa mercurial/templatefuncs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/templatefuncs.py Wed Apr 18 15:32:08 2018 -0400 @@ -0,0 +1,690 @@ +# templatefuncs.py - common template functions +# +# Copyright 2005, 2006 Matt Mackall +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import re + +from .i18n import _ +from .node import ( + bin, +) +from . import ( + color, + encoding, + error, + minirst, + obsutil, + pycompat, + registrar, + revset as revsetmod, + revsetlang, + scmutil, + templatefilters, + templatekw, + templateutil, + util, +) +from .utils import ( + dateutil, + stringutil, +) + +evalrawexp = templateutil.evalrawexp +evalfuncarg = templateutil.evalfuncarg +evalboolean = templateutil.evalboolean +evaldate = templateutil.evaldate +evalinteger = templateutil.evalinteger +evalstring = templateutil.evalstring +evalstringliteral = templateutil.evalstringliteral + +# dict of template built-in functions +funcs = {} +templatefunc = registrar.templatefunc(funcs) + +@templatefunc('date(date[, fmt])') +def date(context, mapping, args): + """Format a date. See :hg:`help dates` for formatting + strings. 
The default is a Unix date format, including the timezone: + "Mon Sep 04 15:13:13 2006 0700".""" + if not (1 <= len(args) <= 2): + # i18n: "date" is a keyword + raise error.ParseError(_("date expects one or two arguments")) + + date = evaldate(context, mapping, args[0], + # i18n: "date" is a keyword + _("date expects a date information")) + fmt = None + if len(args) == 2: + fmt = evalstring(context, mapping, args[1]) + if fmt is None: + return dateutil.datestr(date) + else: + return dateutil.datestr(date, fmt) + +@templatefunc('dict([[key=]value...])', argspec='*args **kwargs') +def dict_(context, mapping, args): + """Construct a dict from key-value pairs. A key may be omitted if + a value expression can provide an unambiguous name.""" + data = util.sortdict() + + for v in args['args']: + k = templateutil.findsymbolicname(v) + if not k: + raise error.ParseError(_('dict key cannot be inferred')) + if k in data or k in args['kwargs']: + raise error.ParseError(_("duplicated dict key '%s' inferred") % k) + data[k] = evalfuncarg(context, mapping, v) + + data.update((k, evalfuncarg(context, mapping, v)) + for k, v in args['kwargs'].iteritems()) + return templateutil.hybriddict(data) + +@templatefunc('diff([includepattern [, excludepattern]])') +def diff(context, mapping, args): + """Show a diff, optionally + specifying files to include or exclude.""" + if len(args) > 2: + # i18n: "diff" is a keyword + raise error.ParseError(_("diff expects zero, one, or two arguments")) + + def getpatterns(i): + if i < len(args): + s = evalstring(context, mapping, args[i]).strip() + if s: + return [s] + return [] + + ctx = context.resource(mapping, 'ctx') + chunks = ctx.diff(match=ctx.match([], getpatterns(0), getpatterns(1))) + + return ''.join(chunks) + +@templatefunc('extdata(source)', argspec='source') +def extdata(context, mapping, args): + """Show a text read from the specified extdata source. 
(EXPERIMENTAL)""" + if 'source' not in args: + # i18n: "extdata" is a keyword + raise error.ParseError(_('extdata expects one argument')) + + source = evalstring(context, mapping, args['source']) + cache = context.resource(mapping, 'cache').setdefault('extdata', {}) + ctx = context.resource(mapping, 'ctx') + if source in cache: + data = cache[source] + else: + data = cache[source] = scmutil.extdatasource(ctx.repo(), source) + return data.get(ctx.rev(), '') + +@templatefunc('files(pattern)') +def files(context, mapping, args): + """All files of the current changeset matching the pattern. See + :hg:`help patterns`.""" + if not len(args) == 1: + # i18n: "files" is a keyword + raise error.ParseError(_("files expects one argument")) + + raw = evalstring(context, mapping, args[0]) + ctx = context.resource(mapping, 'ctx') + m = ctx.match([raw]) + files = list(ctx.matches(m)) + return templateutil.compatlist(context, mapping, "file", files) + +@templatefunc('fill(text[, width[, initialident[, hangindent]]])') +def fill(context, mapping, args): + """Fill many + paragraphs with optional indentation. See the "fill" filter.""" + if not (1 <= len(args) <= 4): + # i18n: "fill" is a keyword + raise error.ParseError(_("fill expects one to four arguments")) + + text = evalstring(context, mapping, args[0]) + width = 76 + initindent = '' + hangindent = '' + if 2 <= len(args) <= 4: + width = evalinteger(context, mapping, args[1], + # i18n: "fill" is a keyword + _("fill expects an integer width")) + try: + initindent = evalstring(context, mapping, args[2]) + hangindent = evalstring(context, mapping, args[3]) + except IndexError: + pass + + return templatefilters.fill(text, width, initindent, hangindent) + +@templatefunc('formatnode(node)') +def formatnode(context, mapping, args): + """Obtain the preferred form of a changeset hash. 
(DEPRECATED)""" + if len(args) != 1: + # i18n: "formatnode" is a keyword + raise error.ParseError(_("formatnode expects one argument")) + + ui = context.resource(mapping, 'ui') + node = evalstring(context, mapping, args[0]) + if ui.debugflag: + return node + return templatefilters.short(node) + +@templatefunc('mailmap(author)') +def mailmap(context, mapping, args): + """Return the author, updated according to the value + set in the .mailmap file""" + if len(args) != 1: + raise error.ParseError(_("mailmap expects one argument")) + + author = evalstring(context, mapping, args[0]) + + cache = context.resource(mapping, 'cache') + repo = context.resource(mapping, 'repo') + + if 'mailmap' not in cache: + data = repo.wvfs.tryread('.mailmap') + cache['mailmap'] = stringutil.parsemailmap(data) + + return stringutil.mapname(cache['mailmap'], author) + +@templatefunc('pad(text, width[, fillchar=\' \'[, left=False]])', + argspec='text width fillchar left') +def pad(context, mapping, args): + """Pad text with a + fill character.""" + if 'text' not in args or 'width' not in args: + # i18n: "pad" is a keyword + raise error.ParseError(_("pad() expects two to four arguments")) + + width = evalinteger(context, mapping, args['width'], + # i18n: "pad" is a keyword + _("pad() expects an integer width")) + + text = evalstring(context, mapping, args['text']) + + left = False + fillchar = ' ' + if 'fillchar' in args: + fillchar = evalstring(context, mapping, args['fillchar']) + if len(color.stripeffects(fillchar)) != 1: + # i18n: "pad" is a keyword + raise error.ParseError(_("pad() expects a single fill character")) + if 'left' in args: + left = evalboolean(context, mapping, args['left']) + + fillwidth = width - encoding.colwidth(color.stripeffects(text)) + if fillwidth <= 0: + return text + if left: + return fillchar * fillwidth + text + else: + return text + fillchar * fillwidth + +@templatefunc('indent(text, indentchars[, firstline])') +def indent(context, mapping, args): + """Indents 
all non-empty lines + with the characters given in the indentchars string. An optional + third parameter will override the indent for the first line only + if present.""" + if not (2 <= len(args) <= 3): + # i18n: "indent" is a keyword + raise error.ParseError(_("indent() expects two or three arguments")) + + text = evalstring(context, mapping, args[0]) + indent = evalstring(context, mapping, args[1]) + + if len(args) == 3: + firstline = evalstring(context, mapping, args[2]) + else: + firstline = indent + + # the indent function doesn't indent the first line, so we do it here + return templatefilters.indent(firstline + text, indent) + +@templatefunc('get(dict, key)') +def get(context, mapping, args): + """Get an attribute/key from an object. Some keywords + are complex types. This function allows you to obtain the value of an + attribute on these types.""" + if len(args) != 2: + # i18n: "get" is a keyword + raise error.ParseError(_("get() expects two arguments")) + + dictarg = evalfuncarg(context, mapping, args[0]) + if not util.safehasattr(dictarg, 'get'): + # i18n: "get" is a keyword + raise error.ParseError(_("get() expects a dict as first argument")) + + key = evalfuncarg(context, mapping, args[1]) + return templateutil.getdictitem(dictarg, key) + +@templatefunc('if(expr, then[, else])') +def if_(context, mapping, args): + """Conditionally execute based on the result of + an expression.""" + if not (2 <= len(args) <= 3): + # i18n: "if" is a keyword + raise error.ParseError(_("if expects two or three arguments")) + + test = evalboolean(context, mapping, args[0]) + if test: + return evalrawexp(context, mapping, args[1]) + elif len(args) == 3: + return evalrawexp(context, mapping, args[2]) + +@templatefunc('ifcontains(needle, haystack, then[, else])') +def ifcontains(context, mapping, args): + """Conditionally execute based + on whether the item "needle" is in "haystack".""" + if not (3 <= len(args) <= 4): + # i18n: "ifcontains" is a keyword + raise 
error.ParseError(_("ifcontains expects three or four arguments")) + + haystack = evalfuncarg(context, mapping, args[1]) + keytype = getattr(haystack, 'keytype', None) + try: + needle = evalrawexp(context, mapping, args[0]) + needle = templateutil.unwrapastype(context, mapping, needle, + keytype or bytes) + found = (needle in haystack) + except error.ParseError: + found = False + + if found: + return evalrawexp(context, mapping, args[2]) + elif len(args) == 4: + return evalrawexp(context, mapping, args[3]) + +@templatefunc('ifeq(expr1, expr2, then[, else])') +def ifeq(context, mapping, args): + """Conditionally execute based on + whether 2 items are equivalent.""" + if not (3 <= len(args) <= 4): + # i18n: "ifeq" is a keyword + raise error.ParseError(_("ifeq expects three or four arguments")) + + test = evalstring(context, mapping, args[0]) + match = evalstring(context, mapping, args[1]) + if test == match: + return evalrawexp(context, mapping, args[2]) + elif len(args) == 4: + return evalrawexp(context, mapping, args[3]) + +@templatefunc('join(list, sep)') +def join(context, mapping, args): + """Join items in a list with a delimiter.""" + if not (1 <= len(args) <= 2): + # i18n: "join" is a keyword + raise error.ParseError(_("join expects one or two arguments")) + + joinset = evalrawexp(context, mapping, args[0]) + joiner = " " + if len(args) > 1: + joiner = evalstring(context, mapping, args[1]) + if isinstance(joinset, templateutil.wrapped): + return joinset.join(context, mapping, joiner) + # TODO: perhaps a generator should be stringify()-ed here, but we can't + # because hgweb abuses it as a keyword that returns a list of dicts. + joinset = templateutil.unwrapvalue(context, mapping, joinset) + return templateutil.joinitems(pycompat.maybebytestr(joinset), joiner) + +@templatefunc('label(label, expr)') +def label(context, mapping, args): + """Apply a label to generated content. 
Content with + a label applied can result in additional post-processing, such as + automatic colorization.""" + if len(args) != 2: + # i18n: "label" is a keyword + raise error.ParseError(_("label expects two arguments")) + + ui = context.resource(mapping, 'ui') + thing = evalstring(context, mapping, args[1]) + # preserve unknown symbol as literal so effects like 'red', 'bold', + # etc. don't need to be quoted + label = evalstringliteral(context, mapping, args[0]) + + return ui.label(thing, label) + +@templatefunc('latesttag([pattern])') +def latesttag(context, mapping, args): + """The global tags matching the given pattern on the + most recent globally tagged ancestor of this changeset. + If no such tags exist, the "{tag}" template resolves to + the string "null".""" + if len(args) > 1: + # i18n: "latesttag" is a keyword + raise error.ParseError(_("latesttag expects at most one argument")) + + pattern = None + if len(args) == 1: + pattern = evalstring(context, mapping, args[0]) + return templatekw.showlatesttags(context, mapping, pattern) + +@templatefunc('localdate(date[, tz])') +def localdate(context, mapping, args): + """Converts a date to the specified timezone. 
+ The default is local date.""" + if not (1 <= len(args) <= 2): + # i18n: "localdate" is a keyword + raise error.ParseError(_("localdate expects one or two arguments")) + + date = evaldate(context, mapping, args[0], + # i18n: "localdate" is a keyword + _("localdate expects a date information")) + if len(args) >= 2: + tzoffset = None + tz = evalfuncarg(context, mapping, args[1]) + if isinstance(tz, bytes): + tzoffset, remainder = dateutil.parsetimezone(tz) + if remainder: + tzoffset = None + if tzoffset is None: + try: + tzoffset = int(tz) + except (TypeError, ValueError): + # i18n: "localdate" is a keyword + raise error.ParseError(_("localdate expects a timezone")) + else: + tzoffset = dateutil.makedate()[1] + return (date[0], tzoffset) + +@templatefunc('max(iterable)') +def max_(context, mapping, args, **kwargs): + """Return the max of an iterable""" + if len(args) != 1: + # i18n: "max" is a keyword + raise error.ParseError(_("max expects one argument")) + + iterable = evalfuncarg(context, mapping, args[0]) + try: + x = max(pycompat.maybebytestr(iterable)) + except (TypeError, ValueError): + # i18n: "max" is a keyword + raise error.ParseError(_("max first argument should be an iterable")) + return templateutil.wraphybridvalue(iterable, x, x) + +@templatefunc('min(iterable)') +def min_(context, mapping, args, **kwargs): + """Return the min of an iterable""" + if len(args) != 1: + # i18n: "min" is a keyword + raise error.ParseError(_("min expects one argument")) + + iterable = evalfuncarg(context, mapping, args[0]) + try: + x = min(pycompat.maybebytestr(iterable)) + except (TypeError, ValueError): + # i18n: "min" is a keyword + raise error.ParseError(_("min first argument should be an iterable")) + return templateutil.wraphybridvalue(iterable, x, x) + +@templatefunc('mod(a, b)') +def mod(context, mapping, args): + """Calculate a mod b such that a / b + a mod b == a""" + if not len(args) == 2: + # i18n: "mod" is a keyword + raise error.ParseError(_("mod expects two 
arguments")) + + func = lambda a, b: a % b + return templateutil.runarithmetic(context, mapping, + (func, args[0], args[1])) + +@templatefunc('obsfateoperations(markers)') +def obsfateoperations(context, mapping, args): + """Compute obsfate related information based on markers (EXPERIMENTAL)""" + if len(args) != 1: + # i18n: "obsfateoperations" is a keyword + raise error.ParseError(_("obsfateoperations expects one argument")) + + markers = evalfuncarg(context, mapping, args[0]) + + try: + data = obsutil.markersoperations(markers) + return templateutil.hybridlist(data, name='operation') + except (TypeError, KeyError): + # i18n: "obsfateoperations" is a keyword + errmsg = _("obsfateoperations first argument should be an iterable") + raise error.ParseError(errmsg) + +@templatefunc('obsfatedate(markers)') +def obsfatedate(context, mapping, args): + """Compute obsfate related information based on markers (EXPERIMENTAL)""" + if len(args) != 1: + # i18n: "obsfatedate" is a keyword + raise error.ParseError(_("obsfatedate expects one argument")) + + markers = evalfuncarg(context, mapping, args[0]) + + try: + data = obsutil.markersdates(markers) + return templateutil.hybridlist(data, name='date', fmt='%d %d') + except (TypeError, KeyError): + # i18n: "obsfatedate" is a keyword + errmsg = _("obsfatedate first argument should be an iterable") + raise error.ParseError(errmsg) + +@templatefunc('obsfateusers(markers)') +def obsfateusers(context, mapping, args): + """Compute obsfate related information based on markers (EXPERIMENTAL)""" + if len(args) != 1: + # i18n: "obsfateusers" is a keyword + raise error.ParseError(_("obsfateusers expects one argument")) + + markers = evalfuncarg(context, mapping, args[0]) + + try: + data = obsutil.markersusers(markers) + return templateutil.hybridlist(data, name='user') + except (TypeError, KeyError, ValueError): + # i18n: "obsfateusers" is a keyword + msg = _("obsfateusers first argument should be an iterable of " + "obsmakers") + raise 
error.ParseError(msg) + +@templatefunc('obsfateverb(successors, markers)') +def obsfateverb(context, mapping, args): + """Compute obsfate related information based on successors (EXPERIMENTAL)""" + if len(args) != 2: + # i18n: "obsfateverb" is a keyword + raise error.ParseError(_("obsfateverb expects two arguments")) + + successors = evalfuncarg(context, mapping, args[0]) + markers = evalfuncarg(context, mapping, args[1]) + + try: + return obsutil.obsfateverb(successors, markers) + except TypeError: + # i18n: "obsfateverb" is a keyword + errmsg = _("obsfateverb first argument should be countable") + raise error.ParseError(errmsg) + +@templatefunc('relpath(path)') +def relpath(context, mapping, args): + """Convert a repository-absolute path into a filesystem path relative to + the current working directory.""" + if len(args) != 1: + # i18n: "relpath" is a keyword + raise error.ParseError(_("relpath expects one argument")) + + repo = context.resource(mapping, 'ctx').repo() + path = evalstring(context, mapping, args[0]) + return repo.pathto(path) + +@templatefunc('revset(query[, formatargs...])') +def revset(context, mapping, args): + """Execute a revision set query. 
See + :hg:`help revset`.""" + if not len(args) > 0: + # i18n: "revset" is a keyword + raise error.ParseError(_("revset expects one or more arguments")) + + raw = evalstring(context, mapping, args[0]) + ctx = context.resource(mapping, 'ctx') + repo = ctx.repo() + + def query(expr): + m = revsetmod.match(repo.ui, expr, lookup=revsetmod.lookupfn(repo)) + return m(repo) + + if len(args) > 1: + formatargs = [evalfuncarg(context, mapping, a) for a in args[1:]] + revs = query(revsetlang.formatspec(raw, *formatargs)) + revs = list(revs) + else: + cache = context.resource(mapping, 'cache') + revsetcache = cache.setdefault("revsetcache", {}) + if raw in revsetcache: + revs = revsetcache[raw] + else: + revs = query(raw) + revs = list(revs) + revsetcache[raw] = revs + return templatekw.showrevslist(context, mapping, "revision", revs) + +@templatefunc('rstdoc(text, style)') +def rstdoc(context, mapping, args): + """Format reStructuredText.""" + if len(args) != 2: + # i18n: "rstdoc" is a keyword + raise error.ParseError(_("rstdoc expects two arguments")) + + text = evalstring(context, mapping, args[0]) + style = evalstring(context, mapping, args[1]) + + return minirst.format(text, style=style, keep=['verbose']) + +@templatefunc('separate(sep, args)', argspec='sep *args') +def separate(context, mapping, args): + """Add a separator between non-empty arguments.""" + if 'sep' not in args: + # i18n: "separate" is a keyword + raise error.ParseError(_("separate expects at least one argument")) + + sep = evalstring(context, mapping, args['sep']) + first = True + for arg in args['args']: + argstr = evalstring(context, mapping, arg) + if not argstr: + continue + if first: + first = False + else: + yield sep + yield argstr + +@templatefunc('shortest(node, minlength=4)') +def shortest(context, mapping, args): + """Obtain the shortest representation of + a node.""" + if not (1 <= len(args) <= 2): + # i18n: "shortest" is a keyword + raise error.ParseError(_("shortest() expects one or two 
arguments")) + + hexnode = evalstring(context, mapping, args[0]) + + minlength = 4 + if len(args) > 1: + minlength = evalinteger(context, mapping, args[1], + # i18n: "shortest" is a keyword + _("shortest() expects an integer minlength")) + + repo = context.resource(mapping, 'ctx')._repo + if len(hexnode) > 40: + return hexnode + elif len(hexnode) == 40: + try: + node = bin(hexnode) + except TypeError: + return hexnode + else: + try: + node = scmutil.resolvehexnodeidprefix(repo, hexnode) + except (error.LookupError, error.WdirUnsupported): + return hexnode + if not node: + return hexnode + return scmutil.shortesthexnodeidprefix(repo, node, minlength) + +@templatefunc('strip(text[, chars])') +def strip(context, mapping, args): + """Strip characters from a string. By default, + strips all leading and trailing whitespace.""" + if not (1 <= len(args) <= 2): + # i18n: "strip" is a keyword + raise error.ParseError(_("strip expects one or two arguments")) + + text = evalstring(context, mapping, args[0]) + if len(args) == 2: + chars = evalstring(context, mapping, args[1]) + return text.strip(chars) + return text.strip() + +@templatefunc('sub(pattern, replacement, expression)') +def sub(context, mapping, args): + """Perform text substitution + using regular expressions.""" + if len(args) != 3: + # i18n: "sub" is a keyword + raise error.ParseError(_("sub expects three arguments")) + + pat = evalstring(context, mapping, args[0]) + rpl = evalstring(context, mapping, args[1]) + src = evalstring(context, mapping, args[2]) + try: + patre = re.compile(pat) + except re.error: + # i18n: "sub" is a keyword + raise error.ParseError(_("sub got an invalid pattern: %s") % pat) + try: + yield patre.sub(rpl, src) + except re.error: + # i18n: "sub" is a keyword + raise error.ParseError(_("sub got an invalid replacement: %s") % rpl) + +@templatefunc('startswith(pattern, text)') +def startswith(context, mapping, args): + """Returns the value from the "text" argument + if it begins with the 
content from the "pattern" argument.""" + if len(args) != 2: + # i18n: "startswith" is a keyword + raise error.ParseError(_("startswith expects two arguments")) + + patn = evalstring(context, mapping, args[0]) + text = evalstring(context, mapping, args[1]) + if text.startswith(patn): + return text + return '' + +@templatefunc('word(number, text[, separator])') +def word(context, mapping, args): + """Return the nth word from a string.""" + if not (2 <= len(args) <= 3): + # i18n: "word" is a keyword + raise error.ParseError(_("word expects two or three arguments, got %d") + % len(args)) + + num = evalinteger(context, mapping, args[0], + # i18n: "word" is a keyword + _("word expects an integer index")) + text = evalstring(context, mapping, args[1]) + if len(args) == 3: + splitter = evalstring(context, mapping, args[2]) + else: + splitter = None + + tokens = text.split(splitter) + if num >= len(tokens) or num < -len(tokens): + return '' + else: + return tokens[num] + +def loadfunction(ui, extname, registrarobj): + """Load template function from specified registrarobj + """ + for name, func in registrarobj._table.iteritems(): + funcs[name] = func + +# tell hggettext to extract docstrings from these functions: +i18nfunctions = funcs.values() diff -r fb92df8b634c -r ed5448edcbfa mercurial/templatekw.py --- a/mercurial/templatekw.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templatekw.py Wed Apr 18 15:32:08 2018 -0400 @@ -23,207 +23,59 @@ pycompat, registrar, scmutil, + templateutil, util, ) +from .utils import ( + stringutil, +) -class _hybrid(object): - """Wrapper for list or dict to support legacy template - - This class allows us to handle both: - - "{files}" (legacy command-line-specific list hack) and - - "{files % '{file}\n'}" (hgweb-style with inlining and function support) - and to access raw values: - - "{ifcontains(file, files, ...)}", "{ifcontains(key, extras, ...)}" - - "{get(extras, key)}" - - "{files|json}" - """ - - def __init__(self, gen, values, 
makemap, joinfmt, keytype=None): - if gen is not None: - self.gen = gen # generator or function returning generator - self._values = values - self._makemap = makemap - self.joinfmt = joinfmt - self.keytype = keytype # hint for 'x in y' where type(x) is unresolved - def gen(self): - """Default generator to stringify this as {join(self, ' ')}""" - for i, x in enumerate(self._values): - if i > 0: - yield ' ' - yield self.joinfmt(x) - def itermaps(self): - makemap = self._makemap - for x in self._values: - yield makemap(x) - def __contains__(self, x): - return x in self._values - def __getitem__(self, key): - return self._values[key] - def __len__(self): - return len(self._values) - def __iter__(self): - return iter(self._values) - def __getattr__(self, name): - if name not in ('get', 'items', 'iteritems', 'iterkeys', 'itervalues', - 'keys', 'values'): - raise AttributeError(name) - return getattr(self._values, name) - -class _mappable(object): - """Wrapper for non-list/dict object to support map operation - - This class allows us to handle both: - - "{manifest}" - - "{manifest % '{rev}:{node}'}" - - "{manifest.rev}" - - Unlike a _hybrid, this does not simulate the behavior of the underling - value. Use unwrapvalue() or unwraphybrid() to obtain the inner object. 
- """ +_hybrid = templateutil.hybrid +_mappable = templateutil.mappable +hybriddict = templateutil.hybriddict +hybridlist = templateutil.hybridlist +compatdict = templateutil.compatdict +compatlist = templateutil.compatlist +_showcompatlist = templateutil._showcompatlist - def __init__(self, gen, key, value, makemap): - if gen is not None: - self.gen = gen # generator or function returning generator - self._key = key - self._value = value # may be generator of strings - self._makemap = makemap - - def gen(self): - yield pycompat.bytestr(self._value) - - def tomap(self): - return self._makemap(self._key) - - def itermaps(self): - yield self.tomap() - -def hybriddict(data, key='key', value='value', fmt='%s=%s', gen=None): - """Wrap data to support both dict-like and string-like operations""" - return _hybrid(gen, data, lambda k: {key: k, value: data[k]}, - lambda k: fmt % (k, data[k])) - -def hybridlist(data, name, fmt='%s', gen=None): - """Wrap data to support both list-like and string-like operations""" - return _hybrid(gen, data, lambda x: {name: x}, lambda x: fmt % x) - -def unwraphybrid(thing): - """Return an object which can be stringified possibly by using a legacy - template""" - gen = getattr(thing, 'gen', None) - if gen is None: - return thing - if callable(gen): - return gen() - return gen - -def unwrapvalue(thing): - """Move the inner value object out of the wrapper""" - if not util.safehasattr(thing, '_value'): - return thing - return thing._value - -def wraphybridvalue(container, key, value): - """Wrap an element of hybrid container to be mappable - - The key is passed to the makemap function of the given container, which - should be an item generated by iter(container). 
- """ - makemap = getattr(container, '_makemap', None) - if makemap is None: - return value - if util.safehasattr(value, '_makemap'): - # a nested hybrid list/dict, which has its own way of map operation - return value - return _mappable(None, key, value, makemap) +def _showlist(name, values, templ, mapping, plural=None, separator=' '): + ui = mapping.get('ui') + if ui: + ui.deprecwarn("templatekw._showlist() is deprecated, use " + "templateutil._showcompatlist()", '4.6') + context = templ # this is actually a template context, not a templater + return _showcompatlist(context, mapping, name, values, plural, separator) def showdict(name, data, mapping, plural=None, key='key', value='value', - fmt='%s=%s', separator=' '): + fmt=None, separator=' '): + ui = mapping.get('ui') + if ui: + ui.deprecwarn("templatekw.showdict() is deprecated, use " + "templateutil.compatdict()", '4.6') c = [{key: k, value: v} for k, v in data.iteritems()] - f = _showlist(name, c, mapping, plural, separator) + f = _showlist(name, c, mapping['templ'], mapping, plural, separator) return hybriddict(data, key=key, value=value, fmt=fmt, gen=f) def showlist(name, values, mapping, plural=None, element=None, separator=' '): + ui = mapping.get('ui') + if ui: + ui.deprecwarn("templatekw.showlist() is deprecated, use " + "templateutil.compatlist()", '4.6') if not element: element = name - f = _showlist(name, values, mapping, plural, separator) + f = _showlist(name, values, mapping['templ'], mapping, plural, separator) return hybridlist(values, name=element, gen=f) -def _showlist(name, values, mapping, plural=None, separator=' '): - '''expand set of values. - name is name of key in template map. - values is list of strings or dicts. - plural is plural of name, if not simply name + 's'. - separator is used to join values as a string - - expansion works like this, given name 'foo'. - - if values is empty, expand 'no_foos'. 
- - if 'foo' not in template map, return values as a string, - joined by 'separator'. - - expand 'start_foos'. - - for each value, expand 'foo'. if 'last_foo' in template - map, expand it instead of 'foo' for last key. - - expand 'end_foos'. - ''' - templ = mapping['templ'] - strmapping = pycompat.strkwargs(mapping) - if not plural: - plural = name + 's' - if not values: - noname = 'no_' + plural - if noname in templ: - yield templ(noname, **strmapping) - return - if name not in templ: - if isinstance(values[0], bytes): - yield separator.join(values) - else: - for v in values: - yield dict(v, **strmapping) - return - startname = 'start_' + plural - if startname in templ: - yield templ(startname, **strmapping) - vmapping = mapping.copy() - def one(v, tag=name): - try: - vmapping.update(v) - except (AttributeError, ValueError): - try: - for a, b in v: - vmapping[a] = b - except ValueError: - vmapping[name] = v - return templ(tag, **pycompat.strkwargs(vmapping)) - lastname = 'last_' + name - if lastname in templ: - last = values.pop() - else: - last = None - for v in values: - yield one(v) - if last is not None: - yield one(last, tag=lastname) - endname = 'end_' + plural - if endname in templ: - yield templ(endname, **strmapping) - -def getfiles(repo, ctx, revcache): - if 'files' not in revcache: - revcache['files'] = repo.status(ctx.p1(), ctx)[:3] - return revcache['files'] - -def getlatesttags(repo, ctx, cache, pattern=None): +def getlatesttags(context, mapping, pattern=None): '''return date, distance and name for the latest tag of rev''' + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + cache = context.resource(mapping, 'cache') cachename = 'latesttags' if pattern is not None: cachename += '-' + pattern - match = util.stringmatcher(pattern)[2] + match = stringutil.stringmatcher(pattern)[2] else: match = util.always @@ -337,91 +189,91 @@ # filecopy is preserved for compatibility reasons defaulttempl['filecopy'] = 
defaulttempl['file_copy'] -# keywords are callables like: -# fn(repo, ctx, templ, cache, revcache, **args) -# with: -# repo - current repository instance -# ctx - the changectx being displayed -# templ - the templater instance -# cache - a cache dictionary for the whole templater run -# revcache - a cache dictionary for the current revision +# keywords are callables (see registrar.templatekeyword for details) keywords = {} - templatekeyword = registrar.templatekeyword(keywords) -@templatekeyword('author') -def showauthor(repo, ctx, templ, **args): +@templatekeyword('author', requires={'ctx'}) +def showauthor(context, mapping): """String. The unmodified author of the changeset.""" + ctx = context.resource(mapping, 'ctx') return ctx.user() -@templatekeyword('bisect') -def showbisect(repo, ctx, templ, **args): +@templatekeyword('bisect', requires={'repo', 'ctx'}) +def showbisect(context, mapping): """String. The changeset bisection status.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') return hbisect.label(repo, ctx.node()) -@templatekeyword('branch') -def showbranch(**args): +@templatekeyword('branch', requires={'ctx'}) +def showbranch(context, mapping): """String. The name of the branch on which the changeset was committed. """ - return args[r'ctx'].branch() + ctx = context.resource(mapping, 'ctx') + return ctx.branch() -@templatekeyword('branches') -def showbranches(**args): +@templatekeyword('branches', requires={'ctx'}) +def showbranches(context, mapping): """List of strings. The name of the branch on which the changeset was committed. Will be empty if the branch name was default. 
(DEPRECATED) """ - args = pycompat.byteskwargs(args) - branch = args['ctx'].branch() + ctx = context.resource(mapping, 'ctx') + branch = ctx.branch() if branch != 'default': - return showlist('branch', [branch], args, plural='branches') - return showlist('branch', [], args, plural='branches') + return compatlist(context, mapping, 'branch', [branch], + plural='branches') + return compatlist(context, mapping, 'branch', [], plural='branches') -@templatekeyword('bookmarks') -def showbookmarks(**args): +@templatekeyword('bookmarks', requires={'repo', 'ctx'}) +def showbookmarks(context, mapping): """List of strings. Any bookmarks associated with the changeset. Also sets 'active', the name of the active bookmark. """ - args = pycompat.byteskwargs(args) - repo = args['ctx']._repo - bookmarks = args['ctx'].bookmarks() + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + bookmarks = ctx.bookmarks() active = repo._activebookmark makemap = lambda v: {'bookmark': v, 'active': active, 'current': active} - f = _showlist('bookmark', bookmarks, args) + f = _showcompatlist(context, mapping, 'bookmark', bookmarks) return _hybrid(f, bookmarks, makemap, pycompat.identity) -@templatekeyword('children') -def showchildren(**args): +@templatekeyword('children', requires={'ctx'}) +def showchildren(context, mapping): """List of strings. The children of the changeset.""" - args = pycompat.byteskwargs(args) - ctx = args['ctx'] - childrevs = ['%d:%s' % (cctx, cctx) for cctx in ctx.children()] - return showlist('children', childrevs, args, element='child') + ctx = context.resource(mapping, 'ctx') + childrevs = ['%d:%s' % (cctx.rev(), cctx) for cctx in ctx.children()] + return compatlist(context, mapping, 'children', childrevs, element='child') # Deprecated, but kept alive for help generation a purpose. 
-@templatekeyword('currentbookmark') -def showcurrentbookmark(**args): +@templatekeyword('currentbookmark', requires={'repo', 'ctx'}) +def showcurrentbookmark(context, mapping): """String. The active bookmark, if it is associated with the changeset. (DEPRECATED)""" - return showactivebookmark(**args) + return showactivebookmark(context, mapping) -@templatekeyword('activebookmark') -def showactivebookmark(**args): +@templatekeyword('activebookmark', requires={'repo', 'ctx'}) +def showactivebookmark(context, mapping): """String. The active bookmark, if it is associated with the changeset.""" - active = args[r'repo']._activebookmark - if active and active in args[r'ctx'].bookmarks(): + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + active = repo._activebookmark + if active and active in ctx.bookmarks(): return active return '' -@templatekeyword('date') -def showdate(repo, ctx, templ, **args): +@templatekeyword('date', requires={'ctx'}) +def showdate(context, mapping): """Date information. The date when the changeset was committed.""" + ctx = context.resource(mapping, 'ctx') return ctx.date() -@templatekeyword('desc') -def showdescription(repo, ctx, templ, **args): +@templatekeyword('desc', requires={'ctx'}) +def showdescription(context, mapping): """String. The text of the changeset description.""" + ctx = context.resource(mapping, 'ctx') s = ctx.description() if isinstance(s, encoding.localstr): # try hard to preserve utf-8 bytes @@ -429,55 +281,64 @@ else: return s.strip() -@templatekeyword('diffstat') -def showdiffstat(repo, ctx, templ, **args): +@templatekeyword('diffstat', requires={'ctx'}) +def showdiffstat(context, mapping): """String. 
Statistics of changes with the following format: "modified files: +added/-removed lines" """ + ctx = context.resource(mapping, 'ctx') stats = patch.diffstatdata(util.iterlines(ctx.diff(noprefix=False))) maxname, maxtotal, adds, removes, binary = patch.diffstatsum(stats) - return '%s: +%s/-%s' % (len(stats), adds, removes) + return '%d: +%d/-%d' % (len(stats), adds, removes) -@templatekeyword('envvars') -def showenvvars(repo, **args): +@templatekeyword('envvars', requires={'ui'}) +def showenvvars(context, mapping): """A dictionary of environment variables. (EXPERIMENTAL)""" - args = pycompat.byteskwargs(args) - env = repo.ui.exportableenviron() + ui = context.resource(mapping, 'ui') + env = ui.exportableenviron() env = util.sortdict((k, env[k]) for k in sorted(env)) - return showdict('envvar', env, args, plural='envvars') + return compatdict(context, mapping, 'envvar', env, plural='envvars') -@templatekeyword('extras') -def showextras(**args): +@templatekeyword('extras', requires={'ctx'}) +def showextras(context, mapping): """List of dicts with key, value entries of the 'extras' field of this changeset.""" - args = pycompat.byteskwargs(args) - extras = args['ctx'].extra() + ctx = context.resource(mapping, 'ctx') + extras = ctx.extra() extras = util.sortdict((k, extras[k]) for k in sorted(extras)) makemap = lambda k: {'key': k, 'value': extras[k]} c = [makemap(k) for k in extras] - f = _showlist('extra', c, args, plural='extras') + f = _showcompatlist(context, mapping, 'extra', c, plural='extras') return _hybrid(f, extras, makemap, - lambda k: '%s=%s' % (k, util.escapestr(extras[k]))) + lambda k: '%s=%s' % (k, stringutil.escapestr(extras[k]))) -@templatekeyword('file_adds') -def showfileadds(**args): +def _showfilesbystat(context, mapping, name, index): + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + revcache = context.resource(mapping, 'revcache') + if 'files' not in revcache: + revcache['files'] = repo.status(ctx.p1(), ctx)[:3] 
+ files = revcache['files'][index] + return compatlist(context, mapping, name, files, element='file') + +@templatekeyword('file_adds', requires={'repo', 'ctx', 'revcache'}) +def showfileadds(context, mapping): """List of strings. Files added by this changeset.""" - args = pycompat.byteskwargs(args) - repo, ctx, revcache = args['repo'], args['ctx'], args['revcache'] - return showlist('file_add', getfiles(repo, ctx, revcache)[1], args, - element='file') + return _showfilesbystat(context, mapping, 'file_add', 1) -@templatekeyword('file_copies') -def showfilecopies(**args): +@templatekeyword('file_copies', + requires={'repo', 'ctx', 'cache', 'revcache'}) +def showfilecopies(context, mapping): """List of strings. Files copied in this changeset with their sources. """ - args = pycompat.byteskwargs(args) - cache, ctx = args['cache'], args['ctx'] - copies = args['revcache'].get('copies') + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + cache = context.resource(mapping, 'cache') + copies = context.resource(mapping, 'revcache').get('copies') if copies is None: if 'getrenamed' not in cache: - cache['getrenamed'] = getrenamedfn(args['repo']) + cache['getrenamed'] = getrenamedfn(repo) copies = [] getrenamed = cache['getrenamed'] for fn in ctx.files(): @@ -486,51 +347,51 @@ copies.append((fn, rename[0])) copies = util.sortdict(copies) - return showdict('file_copy', copies, args, plural='file_copies', - key='name', value='source', fmt='%s (%s)') + return compatdict(context, mapping, 'file_copy', copies, + key='name', value='source', fmt='%s (%s)', + plural='file_copies') # showfilecopiesswitch() displays file copies only if copy records are # provided before calling the templater, usually with a --copies # command line switch. -@templatekeyword('file_copies_switch') -def showfilecopiesswitch(**args): +@templatekeyword('file_copies_switch', requires={'revcache'}) +def showfilecopiesswitch(context, mapping): """List of strings. 
Like "file_copies" but displayed only if the --copied switch is set. """ - args = pycompat.byteskwargs(args) - copies = args['revcache'].get('copies') or [] + copies = context.resource(mapping, 'revcache').get('copies') or [] copies = util.sortdict(copies) - return showdict('file_copy', copies, args, plural='file_copies', - key='name', value='source', fmt='%s (%s)') + return compatdict(context, mapping, 'file_copy', copies, + key='name', value='source', fmt='%s (%s)', + plural='file_copies') -@templatekeyword('file_dels') -def showfiledels(**args): +@templatekeyword('file_dels', requires={'repo', 'ctx', 'revcache'}) +def showfiledels(context, mapping): """List of strings. Files removed by this changeset.""" - args = pycompat.byteskwargs(args) - repo, ctx, revcache = args['repo'], args['ctx'], args['revcache'] - return showlist('file_del', getfiles(repo, ctx, revcache)[2], args, - element='file') + return _showfilesbystat(context, mapping, 'file_del', 2) -@templatekeyword('file_mods') -def showfilemods(**args): +@templatekeyword('file_mods', requires={'repo', 'ctx', 'revcache'}) +def showfilemods(context, mapping): """List of strings. Files modified by this changeset.""" - args = pycompat.byteskwargs(args) - repo, ctx, revcache = args['repo'], args['ctx'], args['revcache'] - return showlist('file_mod', getfiles(repo, ctx, revcache)[0], args, - element='file') + return _showfilesbystat(context, mapping, 'file_mod', 0) -@templatekeyword('files') -def showfiles(**args): +@templatekeyword('files', requires={'ctx'}) +def showfiles(context, mapping): """List of strings. All files modified, added, or removed by this changeset. 
""" - args = pycompat.byteskwargs(args) - return showlist('file', args['ctx'].files(), args) + ctx = context.resource(mapping, 'ctx') + return compatlist(context, mapping, 'file', ctx.files()) -@templatekeyword('graphnode') -def showgraphnode(repo, ctx, **args): +@templatekeyword('graphnode', requires={'repo', 'ctx'}) +def showgraphnode(context, mapping): """String. The character representing the changeset node in an ASCII revision graph.""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + return getgraphnode(repo, ctx) + +def getgraphnode(repo, ctx): wpnodes = repo.dirstate.parents() if wpnodes[1] == nullid: wpnodes = wpnodes[:1] @@ -545,33 +406,29 @@ else: return 'o' -@templatekeyword('graphwidth') -def showgraphwidth(repo, ctx, templ, **args): +@templatekeyword('graphwidth', requires=()) +def showgraphwidth(context, mapping): """Integer. The width of the graph drawn by 'log --graph' or zero.""" - # The value args['graphwidth'] will be this function, so we use an internal - # name to pass the value through props into this function. - return args.get('_graphwidth', 0) + # just hosts documentation; should be overridden by template mapping + return 0 -@templatekeyword('index') -def showindex(**args): +@templatekeyword('index', requires=()) +def showindex(context, mapping): """Integer. The current iteration of the loop. (0 indexed)""" # just hosts documentation; should be overridden by template mapping raise error.Abort(_("can't use index in this context")) -@templatekeyword('latesttag') -def showlatesttag(**args): +@templatekeyword('latesttag', requires={'repo', 'ctx', 'cache'}) +def showlatesttag(context, mapping): """List of strings. The global tags on the most recent globally tagged ancestor of this changeset. If no such tags exist, the list consists of the single string "null". 
""" - return showlatesttags(None, **args) + return showlatesttags(context, mapping, None) -def showlatesttags(pattern, **args): +def showlatesttags(context, mapping, pattern): """helper method for the latesttag keyword and function""" - args = pycompat.byteskwargs(args) - repo, ctx = args['repo'], args['ctx'] - cache = args['cache'] - latesttags = getlatesttags(repo, ctx, cache, pattern) + latesttags = getlatesttags(context, mapping, pattern) # latesttag[0] is an implementation detail for sorting csets on different # branches in a stable manner- it is the date the tagged cset was created, @@ -584,25 +441,27 @@ } tags = latesttags[2] - f = _showlist('latesttag', tags, args, separator=':') + f = _showcompatlist(context, mapping, 'latesttag', tags, separator=':') return _hybrid(f, tags, makemap, pycompat.identity) -@templatekeyword('latesttagdistance') -def showlatesttagdistance(repo, ctx, templ, cache, **args): +@templatekeyword('latesttagdistance', requires={'repo', 'ctx', 'cache'}) +def showlatesttagdistance(context, mapping): """Integer. Longest path to the latest tag.""" - return getlatesttags(repo, ctx, cache)[1] + return getlatesttags(context, mapping)[1] -@templatekeyword('changessincelatesttag') -def showchangessincelatesttag(repo, ctx, templ, cache, **args): +@templatekeyword('changessincelatesttag', requires={'repo', 'ctx', 'cache'}) +def showchangessincelatesttag(context, mapping): """Integer. 
All ancestors not in the latest tag.""" - latesttag = getlatesttags(repo, ctx, cache)[2][0] + tag = getlatesttags(context, mapping)[2][0] + mapping = context.overlaymap(mapping, {'tag': tag}) + return _showchangessincetag(context, mapping) - return _showchangessincetag(repo, ctx, tag=latesttag, **args) - -def _showchangessincetag(repo, ctx, **args): +def _showchangessincetag(context, mapping): + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') offset = 0 revs = [ctx.rev()] - tag = args[r'tag'] + tag = context.symbol(mapping, 'tag') # The only() revset doesn't currently support wdir() if ctx.rev() is None: @@ -611,56 +470,59 @@ return len(repo.revs('only(%ld, %s)', revs, tag)) + offset -@templatekeyword('manifest') -def showmanifest(**args): - repo, ctx, templ = args[r'repo'], args[r'ctx'], args[r'templ'] +# teach templater latesttags.changes is switched to (context, mapping) API +_showchangessincetag._requires = {'repo', 'ctx'} + +@templatekeyword('manifest', requires={'repo', 'ctx'}) +def showmanifest(context, mapping): + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') mnode = ctx.manifestnode() if mnode is None: # just avoid crash, we might want to use the 'ff...' hash in future return mrev = repo.manifestlog._revlog.rev(mnode) mhex = hex(mnode) - args = args.copy() - args.update({r'rev': mrev, r'node': mhex}) - f = templ('manifest', **args) + mapping = context.overlaymap(mapping, {'rev': mrev, 'node': mhex}) + f = context.process('manifest', mapping) # TODO: perhaps 'ctx' should be dropped from mapping because manifest # rev and node are completely different from changeset's. return _mappable(f, None, f, lambda x: {'rev': mrev, 'node': mhex}) -@templatekeyword('obsfate') -def showobsfate(**args): +@templatekeyword('obsfate', requires={'ui', 'repo', 'ctx'}) +def showobsfate(context, mapping): # this function returns a list containing pre-formatted obsfate strings. 
# # This function will be replaced by templates fragments when we will have # the verbosity templatekw available. - succsandmarkers = showsuccsandmarkers(**args) + succsandmarkers = showsuccsandmarkers(context, mapping) - args = pycompat.byteskwargs(args) - ui = args['ui'] - + ui = context.resource(mapping, 'ui') + repo = context.resource(mapping, 'repo') values = [] - for x in succsandmarkers: - values.append(obsutil.obsfateprinter(x['successors'], x['markers'], ui)) - - return showlist("fate", values, args) + for x in succsandmarkers.tovalue(context, mapping): + v = obsutil.obsfateprinter(ui, repo, x['successors'], x['markers'], + scmutil.formatchangeid) + values.append(v) -def shownames(namespace, **args): + return compatlist(context, mapping, "fate", values) + +def shownames(context, mapping, namespace): """helper method to generate a template keyword for a namespace""" - args = pycompat.byteskwargs(args) - ctx = args['ctx'] - repo = ctx.repo() + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') ns = repo.names[namespace] names = ns.names(repo, ctx.node()) - return showlist(ns.templatename, names, args, plural=namespace) + return compatlist(context, mapping, ns.templatename, names, + plural=namespace) -@templatekeyword('namespaces') -def shownamespaces(**args): +@templatekeyword('namespaces', requires={'repo', 'ctx'}) +def shownamespaces(context, mapping): """Dict of lists. 
Names attached to this changeset per namespace.""" - args = pycompat.byteskwargs(args) - ctx = args['ctx'] - repo = ctx.repo() + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') namespaces = util.sortdict() def makensmapfn(ns): @@ -669,10 +531,10 @@ for k, ns in repo.names.iteritems(): names = ns.names(repo, ctx.node()) - f = _showlist('name', names, args) + f = _showcompatlist(context, mapping, 'name', names) namespaces[k] = _hybrid(f, names, makensmapfn(ns), pycompat.identity) - f = _showlist('namespace', list(namespaces), args) + f = _showcompatlist(context, mapping, 'namespace', list(namespaces)) def makemap(ns): return { @@ -684,24 +546,27 @@ return _hybrid(f, namespaces, makemap, pycompat.identity) -@templatekeyword('node') -def shownode(repo, ctx, templ, **args): +@templatekeyword('node', requires={'ctx'}) +def shownode(context, mapping): """String. The changeset identification hash, as a 40 hexadecimal digit string. """ + ctx = context.resource(mapping, 'ctx') return ctx.hex() -@templatekeyword('obsolete') -def showobsolete(repo, ctx, templ, **args): +@templatekeyword('obsolete', requires={'ctx'}) +def showobsolete(context, mapping): """String. Whether the changeset is obsolete. 
(EXPERIMENTAL)""" + ctx = context.resource(mapping, 'ctx') if ctx.obsolete(): return 'obsolete' return '' -@templatekeyword('peerurls') -def showpeerurls(repo, **args): +@templatekeyword('peerurls', requires={'repo'}) +def showpeerurls(context, mapping): """A dictionary of repository locations defined in the [paths] section of your configuration file.""" + repo = context.resource(mapping, 'repo') # see commands.paths() for naming of dictionary keys paths = repo.ui.paths urls = util.sortdict((k, p.rawloc) for k, p in sorted(paths.iteritems())) @@ -712,40 +577,46 @@ return d return _hybrid(None, urls, makemap, lambda k: '%s=%s' % (k, urls[k])) -@templatekeyword("predecessors") -def showpredecessors(repo, ctx, **args): +@templatekeyword("predecessors", requires={'repo', 'ctx'}) +def showpredecessors(context, mapping): """Returns the list if the closest visible successors. (EXPERIMENTAL)""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') predecessors = sorted(obsutil.closestpredecessors(repo, ctx.node())) predecessors = map(hex, predecessors) return _hybrid(None, predecessors, - lambda x: {'ctx': repo[x], 'revcache': {}}, + lambda x: {'ctx': repo[x]}, lambda x: scmutil.formatchangeid(repo[x])) -@templatekeyword("successorssets") -def showsuccessorssets(repo, ctx, **args): +@templatekeyword('reporoot', requires={'repo'}) +def showreporoot(context, mapping): + """String. The root directory of the current repository.""" + repo = context.resource(mapping, 'repo') + return repo.root + +@templatekeyword("successorssets", requires={'repo', 'ctx'}) +def showsuccessorssets(context, mapping): """Returns a string of sets of successors for a changectx. Format used is: [ctx1, ctx2], [ctx3] if ctx has been splitted into ctx1 and ctx2 while also diverged into ctx3. 
(EXPERIMENTAL)""" + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') if not ctx.obsolete(): return '' - args = pycompat.byteskwargs(args) ssets = obsutil.successorssets(repo, ctx.node(), closest=True) ssets = [[hex(n) for n in ss] for ss in ssets] data = [] for ss in ssets: - h = _hybrid(None, ss, lambda x: {'ctx': repo[x], 'revcache': {}}, + h = _hybrid(None, ss, lambda x: {'ctx': repo[x]}, lambda x: scmutil.formatchangeid(repo[x])) data.append(h) # Format the successorssets def render(d): - t = [] - for i in d.gen(): - t.append(i) - return "".join(t) + return templateutil.stringify(context, mapping, d) def gen(data): yield "; ".join(render(d) for d in data) @@ -753,13 +624,15 @@ return _hybrid(gen(data), data, lambda x: {'successorset': x}, pycompat.identity) -@templatekeyword("succsandmarkers") -def showsuccsandmarkers(repo, ctx, **args): +@templatekeyword("succsandmarkers", requires={'repo', 'ctx'}) +def showsuccsandmarkers(context, mapping): """Returns a list of dict for each final successor of ctx. The dict contains successors node id in "successors" keys and the list of obs-markers from ctx to the set of successors in "markers". (EXPERIMENTAL) """ + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') values = obsutil.successorsandmarkers(repo, ctx) @@ -774,7 +647,7 @@ successors = [hex(n) for n in successors] successors = _hybrid(None, successors, - lambda x: {'ctx': repo[x], 'revcache': {}}, + lambda x: {'ctx': repo[x]}, lambda x: scmutil.formatchangeid(repo[x])) # Format markers @@ -790,86 +663,89 @@ data.append({'successors': successors, 'markers': finalmarkers}) - f = _showlist('succsandmarkers', data, args) - return _hybrid(f, data, lambda x: x, pycompat.identity) + return templateutil.mappinglist(data) -@templatekeyword('p1rev') -def showp1rev(repo, ctx, templ, **args): +@templatekeyword('p1rev', requires={'ctx'}) +def showp1rev(context, mapping): """Integer. 
The repository-local revision number of the changeset's first parent, or -1 if the changeset has no parents.""" + ctx = context.resource(mapping, 'ctx') return ctx.p1().rev() -@templatekeyword('p2rev') -def showp2rev(repo, ctx, templ, **args): +@templatekeyword('p2rev', requires={'ctx'}) +def showp2rev(context, mapping): """Integer. The repository-local revision number of the changeset's second parent, or -1 if the changeset has no second parent.""" + ctx = context.resource(mapping, 'ctx') return ctx.p2().rev() -@templatekeyword('p1node') -def showp1node(repo, ctx, templ, **args): +@templatekeyword('p1node', requires={'ctx'}) +def showp1node(context, mapping): """String. The identification hash of the changeset's first parent, as a 40 digit hexadecimal string. If the changeset has no parents, all digits are 0.""" + ctx = context.resource(mapping, 'ctx') return ctx.p1().hex() -@templatekeyword('p2node') -def showp2node(repo, ctx, templ, **args): +@templatekeyword('p2node', requires={'ctx'}) +def showp2node(context, mapping): """String. The identification hash of the changeset's second parent, as a 40 digit hexadecimal string. If the changeset has no second parent, all digits are 0.""" + ctx = context.resource(mapping, 'ctx') return ctx.p2().hex() -@templatekeyword('parents') -def showparents(**args): +@templatekeyword('parents', requires={'repo', 'ctx'}) +def showparents(context, mapping): """List of strings. The parents of the changeset in "rev:node" format. 
If the changeset has only one "natural" parent (the predecessor revision) nothing is shown.""" - args = pycompat.byteskwargs(args) - repo = args['repo'] - ctx = args['ctx'] + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') pctxs = scmutil.meaningfulparents(repo, ctx) prevs = [p.rev() for p in pctxs] parents = [[('rev', p.rev()), ('node', p.hex()), ('phase', p.phasestr())] for p in pctxs] - f = _showlist('parent', parents, args) - return _hybrid(f, prevs, lambda x: {'ctx': repo[x], 'revcache': {}}, + f = _showcompatlist(context, mapping, 'parent', parents) + return _hybrid(f, prevs, lambda x: {'ctx': repo[x]}, lambda x: scmutil.formatchangeid(repo[x]), keytype=int) -@templatekeyword('phase') -def showphase(repo, ctx, templ, **args): +@templatekeyword('phase', requires={'ctx'}) +def showphase(context, mapping): """String. The changeset phase name.""" + ctx = context.resource(mapping, 'ctx') return ctx.phasestr() -@templatekeyword('phaseidx') -def showphaseidx(repo, ctx, templ, **args): +@templatekeyword('phaseidx', requires={'ctx'}) +def showphaseidx(context, mapping): """Integer. The changeset phase index. (ADVANCED)""" + ctx = context.resource(mapping, 'ctx') return ctx.phase() -@templatekeyword('rev') -def showrev(repo, ctx, templ, **args): +@templatekeyword('rev', requires={'ctx'}) +def showrev(context, mapping): """Integer. 
The repository-local changeset revision number.""" + ctx = context.resource(mapping, 'ctx') return scmutil.intrev(ctx) -def showrevslist(name, revs, **args): +def showrevslist(context, mapping, name, revs): """helper to generate a list of revisions in which a mapped template will be evaluated""" - args = pycompat.byteskwargs(args) - repo = args['ctx'].repo() - f = _showlist(name, ['%d' % r for r in revs], args) + repo = context.resource(mapping, 'repo') + f = _showcompatlist(context, mapping, name, ['%d' % r for r in revs]) return _hybrid(f, revs, - lambda x: {name: x, 'ctx': repo[x], 'revcache': {}}, + lambda x: {name: x, 'ctx': repo[x]}, pycompat.identity, keytype=int) -@templatekeyword('subrepos') -def showsubrepos(**args): +@templatekeyword('subrepos', requires={'ctx'}) +def showsubrepos(context, mapping): """List of strings. Updated subrepositories in the changeset.""" - args = pycompat.byteskwargs(args) - ctx = args['ctx'] + ctx = context.resource(mapping, 'ctx') substate = ctx.substate if not substate: - return showlist('subrepo', [], args) + return compatlist(context, mapping, 'subrepo', []) psubstate = ctx.parents()[0].substate or {} subrepos = [] for sub in substate: @@ -878,46 +754,37 @@ for sub in psubstate: if sub not in substate: subrepos.append(sub) # removed in ctx - return showlist('subrepo', sorted(subrepos), args) + return compatlist(context, mapping, 'subrepo', sorted(subrepos)) # don't remove "showtags" definition, even though namespaces will put # a helper function for "tags" keyword into "keywords" map automatically, # because online help text is built without namespaces initialization -@templatekeyword('tags') -def showtags(**args): +@templatekeyword('tags', requires={'repo', 'ctx'}) +def showtags(context, mapping): """List of strings. Any tags associated with the changeset.""" - return shownames('tags', **args) - -@templatekeyword('termwidth') -def showtermwidth(repo, ctx, templ, **args): - """Integer. 
The width of the current terminal.""" - return repo.ui.termwidth() + return shownames(context, mapping, 'tags') -@templatekeyword('troubles') -def showtroubles(repo, **args): - """List of strings. Evolution troubles affecting the changeset. - (DEPRECATED) - """ - msg = ("'troubles' is deprecated, " - "use 'instabilities'") - repo.ui.deprecwarn(msg, '4.4') +@templatekeyword('termwidth', requires={'ui'}) +def showtermwidth(context, mapping): + """Integer. The width of the current terminal.""" + ui = context.resource(mapping, 'ui') + return ui.termwidth() - return showinstabilities(repo=repo, **args) - -@templatekeyword('instabilities') -def showinstabilities(**args): +@templatekeyword('instabilities', requires={'ctx'}) +def showinstabilities(context, mapping): """List of strings. Evolution instabilities affecting the changeset. (EXPERIMENTAL) """ - args = pycompat.byteskwargs(args) - return showlist('instability', args['ctx'].instabilities(), args, - plural='instabilities') + ctx = context.resource(mapping, 'ctx') + return compatlist(context, mapping, 'instability', ctx.instabilities(), + plural='instabilities') -@templatekeyword('verbosity') -def showverbosity(ui, **args): +@templatekeyword('verbosity', requires={'ui'}) +def showverbosity(context, mapping): """String. The current output verbosity in 'debug', 'quiet', 'verbose', or ''.""" - # see cmdutil.changeset_templater for priority of these flags + ui = context.resource(mapping, 'ui') + # see logcmdutil.changesettemplater for priority of these flags if ui.debugflag: return 'debug' elif ui.quiet: @@ -926,6 +793,31 @@ return 'verbose' return '' +@templatekeyword('whyunstable', requires={'repo', 'ctx'}) +def showwhyunstable(context, mapping): + """List of dicts explaining all instabilities of a changeset. 
+ (EXPERIMENTAL) + """ + repo = context.resource(mapping, 'repo') + ctx = context.resource(mapping, 'ctx') + + def formatnode(ctx): + return '%s (%s)' % (scmutil.formatchangeid(ctx), ctx.phasestr()) + + entries = obsutil.whyunstable(repo, ctx) + + for entry in entries: + if entry.get('divergentnodes'): + dnodes = entry['divergentnodes'] + dnhybrid = _hybrid(None, [dnode.hex() for dnode in dnodes], + lambda x: {'ctx': repo[x]}, + lambda x: formatnode(repo[x])) + entry['divergentnodes'] = dnhybrid + + tmpl = ('{instability}:{if(divergentnodes, " ")}{divergentnodes} ' + '{reason} {node|short}') + return templateutil.mappinglist(entries, tmpl=tmpl, sep='\n') + def loadkeyword(ui, extname, registrarobj): """Load template keyword from specified registrarobj """ diff -r fb92df8b634c -r ed5448edcbfa mercurial/templater.py --- a/mercurial/templater.py Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templater.py Wed Apr 18 15:32:08 2018 -0400 @@ -5,30 +5,75 @@ # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. +"""Slightly complicated template engine for commands and hgweb + +This module provides low-level interface to the template engine. See the +formatter and cmdutil modules if you are looking for high-level functions +such as ``cmdutil.rendertemplate(ctx, tmpl)``. + +Internal Data Types +------------------- + +Template keywords and functions take a dictionary of current symbols and +resources (a "mapping") and return result. Inputs and outputs must be one +of the following data types: + +bytes + a byte string, which is generally a human-readable text in local encoding. + +generator + a lazily-evaluated byte string, which is a possibly nested generator of + values of any printable types, and will be folded by ``stringify()`` + or ``flatten()``. + + BUG: hgweb overloads this type for mappings (i.e. some hgweb keywords + returns a generator of dicts.) 
+ +None + sometimes represents an empty value, which can be stringified to ''. + +True, False, int, float + can be stringified as such. + +date tuple + a (unixtime, offset) tuple, which produces no meaningful output by itself. + +hybrid + represents a list/dict of printable values, which can also be converted + to mappings by % operator. + +mappable + represents a scalar printable value, also supports % operator. + +mappinggenerator, mappinglist + represents mappings (i.e. a list of dicts), which may have default + output format. + +mappedgenerator + a lazily-evaluated list of byte strings, which is e.g. a result of % + operation. +""" + from __future__ import absolute_import, print_function +import abc import os -import re -import types from .i18n import _ from . import ( - color, config, encoding, error, - minirst, - obsutil, parser, pycompat, - registrar, - revset as revsetmod, - revsetlang, - scmutil, templatefilters, - templatekw, + templatefuncs, + templateutil, util, ) +from .utils import ( + stringutil, +) # template parsing @@ -92,8 +137,8 @@ pos += 1 yield ('integer', program[s:pos], s) pos -= 1 - elif (c == '\\' and program[pos:pos + 2] in (r"\'", r'\"') - or c == 'r' and program[pos:pos + 3] in (r"r\'", r'r\"')): + elif (c == '\\' and program[pos:pos + 2] in (br"\'", br'\"') + or c == 'r' and program[pos:pos + 3] in (br"r\'", br'r\"')): # handle escaped quoted strings for compatibility with 2.9.2-3.4, # where some of nested templates were preprocessed as strings and # then compiled. therefore, \"...\" was allowed. 
(issue4733) @@ -138,7 +183,7 @@ yield ('symbol', sym, s) pos -= 1 elif c == term: - yield ('end', None, pos + 1) + yield ('end', None, pos) return else: raise error.ParseError(_("syntax error"), pos) @@ -161,36 +206,98 @@ ([('string', 'foo\\')], 6) """ parsed = [] + for typ, val, pos in _scantemplate(tmpl, start, stop, quote): + if typ == 'string': + parsed.append((typ, val)) + elif typ == 'template': + parsed.append(val) + elif typ == 'end': + return parsed, pos + else: + raise error.ProgrammingError('unexpected type: %s' % typ) + raise error.ProgrammingError('unterminated scanning of template') + +def scantemplate(tmpl, raw=False): + r"""Scan (type, start, end) positions of outermost elements in template + + If raw=True, a backslash is not taken as an escape character just like + r'' string in Python. Note that this is different from r'' literal in + template in that no template fragment can appear in r'', e.g. r'{foo}' + is a literal '{foo}', but ('{foo}', raw=True) is a template expression + 'foo'. 
+ + >>> list(scantemplate(b'foo{bar}"baz')) + [('string', 0, 3), ('template', 3, 8), ('string', 8, 12)] + >>> list(scantemplate(b'outer{"inner"}outer')) + [('string', 0, 5), ('template', 5, 14), ('string', 14, 19)] + >>> list(scantemplate(b'foo\\{escaped}')) + [('string', 0, 5), ('string', 5, 13)] + >>> list(scantemplate(b'foo\\{escaped}', raw=True)) + [('string', 0, 4), ('template', 4, 13)] + """ + last = None + for typ, val, pos in _scantemplate(tmpl, 0, len(tmpl), raw=raw): + if last: + yield last + (pos,) + if typ == 'end': + return + else: + last = (typ, pos) + raise error.ProgrammingError('unterminated scanning of template') + +def _scantemplate(tmpl, start, stop, quote='', raw=False): + """Parse template string into chunks of strings and template expressions""" sepchars = '{' + quote + unescape = [parser.unescapestr, pycompat.identity][raw] pos = start p = parser.parser(elements) - while pos < stop: - n = min((tmpl.find(c, pos, stop) for c in sepchars), - key=lambda n: (n < 0, n)) - if n < 0: - parsed.append(('string', parser.unescapestr(tmpl[pos:stop]))) - pos = stop - break - c = tmpl[n:n + 1] - bs = (n - pos) - len(tmpl[pos:n].rstrip('\\')) - if bs % 2 == 1: - # escaped (e.g. '\{', '\\\{', but not '\\{') - parsed.append(('string', parser.unescapestr(tmpl[pos:n - 1]) + c)) - pos = n + 1 - continue - if n > pos: - parsed.append(('string', parser.unescapestr(tmpl[pos:n]))) - if c == quote: - return parsed, n + 1 + try: + while pos < stop: + n = min((tmpl.find(c, pos, stop) for c in sepchars), + key=lambda n: (n < 0, n)) + if n < 0: + yield ('string', unescape(tmpl[pos:stop]), pos) + pos = stop + break + c = tmpl[n:n + 1] + bs = 0 # count leading backslashes + if not raw: + bs = (n - pos) - len(tmpl[pos:n].rstrip('\\')) + if bs % 2 == 1: + # escaped (e.g. 
'\{', '\\\{', but not '\\{') + yield ('string', unescape(tmpl[pos:n - 1]) + c, pos) + pos = n + 1 + continue + if n > pos: + yield ('string', unescape(tmpl[pos:n]), pos) + if c == quote: + yield ('end', None, n + 1) + return - parseres, pos = p.parse(tokenize(tmpl, n + 1, stop, '}')) - if not tmpl.endswith('}', n + 1, pos): - raise error.ParseError(_("invalid token"), pos) - parsed.append(parseres) + parseres, pos = p.parse(tokenize(tmpl, n + 1, stop, '}')) + if not tmpl.startswith('}', pos): + raise error.ParseError(_("invalid token"), pos) + yield ('template', parseres, n) + pos += 1 - if quote: - raise error.ParseError(_("unterminated string"), start) - return parsed, pos + if quote: + raise error.ParseError(_("unterminated string"), start) + except error.ParseError as inst: + if len(inst.args) > 1: # has location + loc = inst.args[1] + # Offset the caret location by the number of newlines before the + # location of the error, since we will replace one-char newlines + # with the two-char literal r'\n'. + offset = tmpl[:loc].count('\n') + tmpl = tmpl.replace('\n', br'\n') + # We want the caret to point to the place in the template that + # failed to parse, but in a hint we get a open paren at the + # start. Therefore, we print "loc + 1" spaces (instead of "loc") + # to line up the caret with the location of the error. 
+ inst.hint = (tmpl + '\n' + + ' ' * (loc + 1 + offset) + '^ ' + _('here')) + raise + yield ('end', None, pos) def _unnesttemplatelist(tree): """Expand list of templates to node tuple @@ -292,236 +399,48 @@ return context._load(exp[1]) raise error.ParseError(_("expected template specifier")) -def findsymbolicname(arg): - """Find symbolic name for the given compiled expression; returns None - if nothing found reliably""" - while True: - func, data = arg - if func is runsymbol: - return data - elif func is runfilter: - arg = data[0] - else: - return None - -def evalrawexp(context, mapping, arg): - """Evaluate given argument as a bare template object which may require - further processing (such as folding generator of strings)""" - func, data = arg - return func(context, mapping, data) - -def evalfuncarg(context, mapping, arg): - """Evaluate given argument as value type""" - thing = evalrawexp(context, mapping, arg) - thing = templatekw.unwrapvalue(thing) - # evalrawexp() may return string, generator of strings or arbitrary object - # such as date tuple, but filter does not want generator. 
- if isinstance(thing, types.GeneratorType): - thing = stringify(thing) - return thing - -def evalboolean(context, mapping, arg): - """Evaluate given argument as boolean, but also takes boolean literals""" - func, data = arg - if func is runsymbol: - thing = func(context, mapping, data, default=None) - if thing is None: - # not a template keyword, takes as a boolean literal - thing = util.parsebool(data) - else: - thing = func(context, mapping, data) - thing = templatekw.unwrapvalue(thing) - if isinstance(thing, bool): - return thing - # other objects are evaluated as strings, which means 0 is True, but - # empty dict/list should be False as they are expected to be '' - return bool(stringify(thing)) - -def evalinteger(context, mapping, arg, err=None): - v = evalfuncarg(context, mapping, arg) - try: - return int(v) - except (TypeError, ValueError): - raise error.ParseError(err or _('not an integer')) - -def evalstring(context, mapping, arg): - return stringify(evalrawexp(context, mapping, arg)) - -def evalstringliteral(context, mapping, arg): - """Evaluate given argument as string template, but returns symbol name - if it is unknown""" - func, data = arg - if func is runsymbol: - thing = func(context, mapping, data, default=data) - else: - thing = func(context, mapping, data) - return stringify(thing) - -_evalfuncbytype = { - bool: evalboolean, - bytes: evalstring, - int: evalinteger, -} - -def evalastype(context, mapping, arg, typ): - """Evaluate given argument and coerce its type""" - try: - f = _evalfuncbytype[typ] - except KeyError: - raise error.ProgrammingError('invalid type specified: %r' % typ) - return f(context, mapping, arg) - -def runinteger(context, mapping, data): - return int(data) - -def runstring(context, mapping, data): - return data - -def _recursivesymbolblocker(key): - def showrecursion(**args): - raise error.Abort(_("recursive reference '%s' in template") % key) - return showrecursion - def _runrecursivesymbol(context, mapping, key): raise 
error.Abort(_("recursive reference '%s' in template") % key) -def runsymbol(context, mapping, key, default=''): - v = context.symbol(mapping, key) - if v is None: - # put poison to cut recursion. we can't move this to parsing phase - # because "x = {x}" is allowed if "x" is a keyword. (issue4758) - safemapping = mapping.copy() - safemapping[key] = _recursivesymbolblocker(key) - try: - v = context.process(key, safemapping) - except TemplateNotFound: - v = default - if callable(v): - # TODO: templatekw functions will be updated to take (context, mapping) - # pair instead of **props - props = context._resources.copy() - props.update(mapping) - return v(**pycompat.strkwargs(props)) - return v - def buildtemplate(exp, context): ctmpl = [compileexp(e, context, methods) for e in exp[1:]] - return (runtemplate, ctmpl) - -def runtemplate(context, mapping, template): - for arg in template: - yield evalrawexp(context, mapping, arg) + return (templateutil.runtemplate, ctmpl) def buildfilter(exp, context): n = getsymbol(exp[2]) if n in context._filters: filt = context._filters[n] arg = compileexp(exp[1], context, methods) - return (runfilter, (arg, filt)) - if n in funcs: - f = funcs[n] + return (templateutil.runfilter, (arg, filt)) + if n in context._funcs: + f = context._funcs[n] args = _buildfuncargs(exp[1], context, methods, n, f._argspec) return (f, args) raise error.ParseError(_("unknown function '%s'") % n) -def runfilter(context, mapping, data): - arg, filt = data - thing = evalfuncarg(context, mapping, arg) - try: - return filt(thing) - except (ValueError, AttributeError, TypeError): - sym = findsymbolicname(arg) - if sym: - msg = (_("template filter '%s' is not compatible with keyword '%s'") - % (pycompat.sysbytes(filt.__name__), sym)) - else: - msg = (_("incompatible use of template filter '%s'") - % pycompat.sysbytes(filt.__name__)) - raise error.Abort(msg) - def buildmap(exp, context): darg = compileexp(exp[1], context, methods) targ = gettemplate(exp[2], context) 
- return (runmap, (darg, targ)) - -def runmap(context, mapping, data): - darg, targ = data - d = evalrawexp(context, mapping, darg) - if util.safehasattr(d, 'itermaps'): - diter = d.itermaps() - else: - try: - diter = iter(d) - except TypeError: - sym = findsymbolicname(darg) - if sym: - raise error.ParseError(_("keyword '%s' is not iterable") % sym) - else: - raise error.ParseError(_("%r is not iterable") % d) - - for i, v in enumerate(diter): - lm = mapping.copy() - lm['index'] = i - if isinstance(v, dict): - lm.update(v) - lm['originalnode'] = mapping.get('node') - yield evalrawexp(context, lm, targ) - else: - # v is not an iterable of dicts, this happen when 'key' - # has been fully expanded already and format is useless. - # If so, return the expanded value. - yield v + return (templateutil.runmap, (darg, targ)) def buildmember(exp, context): darg = compileexp(exp[1], context, methods) memb = getsymbol(exp[2]) - return (runmember, (darg, memb)) - -def runmember(context, mapping, data): - darg, memb = data - d = evalrawexp(context, mapping, darg) - if util.safehasattr(d, 'tomap'): - lm = mapping.copy() - lm.update(d.tomap()) - return runsymbol(context, lm, memb) - if util.safehasattr(d, 'get'): - return _getdictitem(d, memb) - - sym = findsymbolicname(darg) - if sym: - raise error.ParseError(_("keyword '%s' has no member") % sym) - else: - raise error.ParseError(_("%r has no member") % d) + return (templateutil.runmember, (darg, memb)) def buildnegate(exp, context): arg = compileexp(exp[1], context, exprmethods) - return (runnegate, arg) - -def runnegate(context, mapping, data): - data = evalinteger(context, mapping, data, - _('negation needs an integer argument')) - return -data + return (templateutil.runnegate, arg) def buildarithmetic(exp, context, func): left = compileexp(exp[1], context, exprmethods) right = compileexp(exp[2], context, exprmethods) - return (runarithmetic, (func, left, right)) - -def runarithmetic(context, mapping, data): - func, left, 
right = data - left = evalinteger(context, mapping, left, - _('arithmetic only defined on integers')) - right = evalinteger(context, mapping, right, - _('arithmetic only defined on integers')) - try: - return func(left, right) - except ZeroDivisionError: - raise error.Abort(_('division by zero is not defined')) + return (templateutil.runarithmetic, (func, left, right)) def buildfunc(exp, context): n = getsymbol(exp[1]) - if n in funcs: - f = funcs[n] + if n in context._funcs: + f = context._funcs[n] args = _buildfuncargs(exp[2], context, exprmethods, n, f._argspec) return (f, args) if n in context._filters: @@ -529,14 +448,14 @@ if len(args) != 1: raise error.ParseError(_("filter %s expects one argument") % n) f = context._filters[n] - return (runfilter, (args[0], f)) + return (templateutil.runfilter, (args[0], f)) raise error.ParseError(_("unknown function '%s'") % n) def _buildfuncargs(exp, context, curmethods, funcname, argspec): """Compile parsed tree of function arguments into list or dict of (func, data) pairs - >>> context = engine(lambda t: (runsymbol, t)) + >>> context = engine(lambda t: (templateutil.runsymbol, t)) >>> def fargs(expr, argspec): ... x = _parseexpr(expr) ... n = getsymbol(x[1]) @@ -572,647 +491,11 @@ def buildkeyvaluepair(exp, content): raise error.ParseError(_("can't use a key-value pair in this context")) -# dict of template built-in functions -funcs = {} - -templatefunc = registrar.templatefunc(funcs) - -@templatefunc('date(date[, fmt])') -def date(context, mapping, args): - """Format a date. See :hg:`help dates` for formatting - strings. 
The default is a Unix date format, including the timezone: - "Mon Sep 04 15:13:13 2006 0700".""" - if not (1 <= len(args) <= 2): - # i18n: "date" is a keyword - raise error.ParseError(_("date expects one or two arguments")) - - date = evalfuncarg(context, mapping, args[0]) - fmt = None - if len(args) == 2: - fmt = evalstring(context, mapping, args[1]) - try: - if fmt is None: - return util.datestr(date) - else: - return util.datestr(date, fmt) - except (TypeError, ValueError): - # i18n: "date" is a keyword - raise error.ParseError(_("date expects a date information")) - -@templatefunc('dict([[key=]value...])', argspec='*args **kwargs') -def dict_(context, mapping, args): - """Construct a dict from key-value pairs. A key may be omitted if - a value expression can provide an unambiguous name.""" - data = util.sortdict() - - for v in args['args']: - k = findsymbolicname(v) - if not k: - raise error.ParseError(_('dict key cannot be inferred')) - if k in data or k in args['kwargs']: - raise error.ParseError(_("duplicated dict key '%s' inferred") % k) - data[k] = evalfuncarg(context, mapping, v) - - data.update((k, evalfuncarg(context, mapping, v)) - for k, v in args['kwargs'].iteritems()) - return templatekw.hybriddict(data) - -@templatefunc('diff([includepattern [, excludepattern]])') -def diff(context, mapping, args): - """Show a diff, optionally - specifying files to include or exclude.""" - if len(args) > 2: - # i18n: "diff" is a keyword - raise error.ParseError(_("diff expects zero, one, or two arguments")) - - def getpatterns(i): - if i < len(args): - s = evalstring(context, mapping, args[i]).strip() - if s: - return [s] - return [] - - ctx = context.resource(mapping, 'ctx') - chunks = ctx.diff(match=ctx.match([], getpatterns(0), getpatterns(1))) - - return ''.join(chunks) - -@templatefunc('extdata(source)', argspec='source') -def extdata(context, mapping, args): - """Show a text read from the specified extdata source. 
(EXPERIMENTAL)""" - if 'source' not in args: - # i18n: "extdata" is a keyword - raise error.ParseError(_('extdata expects one argument')) - - source = evalstring(context, mapping, args['source']) - cache = context.resource(mapping, 'cache').setdefault('extdata', {}) - ctx = context.resource(mapping, 'ctx') - if source in cache: - data = cache[source] - else: - data = cache[source] = scmutil.extdatasource(ctx.repo(), source) - return data.get(ctx.rev(), '') - -@templatefunc('files(pattern)') -def files(context, mapping, args): - """All files of the current changeset matching the pattern. See - :hg:`help patterns`.""" - if not len(args) == 1: - # i18n: "files" is a keyword - raise error.ParseError(_("files expects one argument")) - - raw = evalstring(context, mapping, args[0]) - ctx = context.resource(mapping, 'ctx') - m = ctx.match([raw]) - files = list(ctx.matches(m)) - # TODO: pass (context, mapping) pair to keyword function - props = context._resources.copy() - props.update(mapping) - return templatekw.showlist("file", files, props) - -@templatefunc('fill(text[, width[, initialident[, hangindent]]])') -def fill(context, mapping, args): - """Fill many - paragraphs with optional indentation. See the "fill" filter.""" - if not (1 <= len(args) <= 4): - # i18n: "fill" is a keyword - raise error.ParseError(_("fill expects one to four arguments")) - - text = evalstring(context, mapping, args[0]) - width = 76 - initindent = '' - hangindent = '' - if 2 <= len(args) <= 4: - width = evalinteger(context, mapping, args[1], - # i18n: "fill" is a keyword - _("fill expects an integer width")) - try: - initindent = evalstring(context, mapping, args[2]) - hangindent = evalstring(context, mapping, args[3]) - except IndexError: - pass - - return templatefilters.fill(text, width, initindent, hangindent) - -@templatefunc('formatnode(node)') -def formatnode(context, mapping, args): - """Obtain the preferred form of a changeset hash. 
(DEPRECATED)""" - if len(args) != 1: - # i18n: "formatnode" is a keyword - raise error.ParseError(_("formatnode expects one argument")) - - ui = context.resource(mapping, 'ui') - node = evalstring(context, mapping, args[0]) - if ui.debugflag: - return node - return templatefilters.short(node) - -@templatefunc('pad(text, width[, fillchar=\' \'[, left=False]])', - argspec='text width fillchar left') -def pad(context, mapping, args): - """Pad text with a - fill character.""" - if 'text' not in args or 'width' not in args: - # i18n: "pad" is a keyword - raise error.ParseError(_("pad() expects two to four arguments")) - - width = evalinteger(context, mapping, args['width'], - # i18n: "pad" is a keyword - _("pad() expects an integer width")) - - text = evalstring(context, mapping, args['text']) - - left = False - fillchar = ' ' - if 'fillchar' in args: - fillchar = evalstring(context, mapping, args['fillchar']) - if len(color.stripeffects(fillchar)) != 1: - # i18n: "pad" is a keyword - raise error.ParseError(_("pad() expects a single fill character")) - if 'left' in args: - left = evalboolean(context, mapping, args['left']) - - fillwidth = width - encoding.colwidth(color.stripeffects(text)) - if fillwidth <= 0: - return text - if left: - return fillchar * fillwidth + text - else: - return text + fillchar * fillwidth - -@templatefunc('indent(text, indentchars[, firstline])') -def indent(context, mapping, args): - """Indents all non-empty lines - with the characters given in the indentchars string. 
An optional - third parameter will override the indent for the first line only - if present.""" - if not (2 <= len(args) <= 3): - # i18n: "indent" is a keyword - raise error.ParseError(_("indent() expects two or three arguments")) - - text = evalstring(context, mapping, args[0]) - indent = evalstring(context, mapping, args[1]) - - if len(args) == 3: - firstline = evalstring(context, mapping, args[2]) - else: - firstline = indent - - # the indent function doesn't indent the first line, so we do it here - return templatefilters.indent(firstline + text, indent) - -@templatefunc('get(dict, key)') -def get(context, mapping, args): - """Get an attribute/key from an object. Some keywords - are complex types. This function allows you to obtain the value of an - attribute on these types.""" - if len(args) != 2: - # i18n: "get" is a keyword - raise error.ParseError(_("get() expects two arguments")) - - dictarg = evalfuncarg(context, mapping, args[0]) - if not util.safehasattr(dictarg, 'get'): - # i18n: "get" is a keyword - raise error.ParseError(_("get() expects a dict as first argument")) - - key = evalfuncarg(context, mapping, args[1]) - return _getdictitem(dictarg, key) - -def _getdictitem(dictarg, key): - val = dictarg.get(key) - if val is None: - return - return templatekw.wraphybridvalue(dictarg, key, val) - -@templatefunc('if(expr, then[, else])') -def if_(context, mapping, args): - """Conditionally execute based on the result of - an expression.""" - if not (2 <= len(args) <= 3): - # i18n: "if" is a keyword - raise error.ParseError(_("if expects two or three arguments")) - - test = evalboolean(context, mapping, args[0]) - if test: - yield evalrawexp(context, mapping, args[1]) - elif len(args) == 3: - yield evalrawexp(context, mapping, args[2]) - -@templatefunc('ifcontains(needle, haystack, then[, else])') -def ifcontains(context, mapping, args): - """Conditionally execute based - on whether the item "needle" is in "haystack".""" - if not (3 <= len(args) <= 4): - # 
i18n: "ifcontains" is a keyword - raise error.ParseError(_("ifcontains expects three or four arguments")) - - haystack = evalfuncarg(context, mapping, args[1]) - try: - needle = evalastype(context, mapping, args[0], - getattr(haystack, 'keytype', None) or bytes) - found = (needle in haystack) - except error.ParseError: - found = False - - if found: - yield evalrawexp(context, mapping, args[2]) - elif len(args) == 4: - yield evalrawexp(context, mapping, args[3]) - -@templatefunc('ifeq(expr1, expr2, then[, else])') -def ifeq(context, mapping, args): - """Conditionally execute based on - whether 2 items are equivalent.""" - if not (3 <= len(args) <= 4): - # i18n: "ifeq" is a keyword - raise error.ParseError(_("ifeq expects three or four arguments")) - - test = evalstring(context, mapping, args[0]) - match = evalstring(context, mapping, args[1]) - if test == match: - yield evalrawexp(context, mapping, args[2]) - elif len(args) == 4: - yield evalrawexp(context, mapping, args[3]) - -@templatefunc('join(list, sep)') -def join(context, mapping, args): - """Join items in a list with a delimiter.""" - if not (1 <= len(args) <= 2): - # i18n: "join" is a keyword - raise error.ParseError(_("join expects one or two arguments")) - - # TODO: perhaps this should be evalfuncarg(), but it can't because hgweb - # abuses generator as a keyword that returns a list of dicts. - joinset = evalrawexp(context, mapping, args[0]) - joinset = templatekw.unwrapvalue(joinset) - joinfmt = getattr(joinset, 'joinfmt', pycompat.identity) - joiner = " " - if len(args) > 1: - joiner = evalstring(context, mapping, args[1]) - - first = True - for x in joinset: - if first: - first = False - else: - yield joiner - yield joinfmt(x) - -@templatefunc('label(label, expr)') -def label(context, mapping, args): - """Apply a label to generated content. 
Content with - a label applied can result in additional post-processing, such as - automatic colorization.""" - if len(args) != 2: - # i18n: "label" is a keyword - raise error.ParseError(_("label expects two arguments")) - - ui = context.resource(mapping, 'ui') - thing = evalstring(context, mapping, args[1]) - # preserve unknown symbol as literal so effects like 'red', 'bold', - # etc. don't need to be quoted - label = evalstringliteral(context, mapping, args[0]) - - return ui.label(thing, label) - -@templatefunc('latesttag([pattern])') -def latesttag(context, mapping, args): - """The global tags matching the given pattern on the - most recent globally tagged ancestor of this changeset. - If no such tags exist, the "{tag}" template resolves to - the string "null".""" - if len(args) > 1: - # i18n: "latesttag" is a keyword - raise error.ParseError(_("latesttag expects at most one argument")) - - pattern = None - if len(args) == 1: - pattern = evalstring(context, mapping, args[0]) - - # TODO: pass (context, mapping) pair to keyword function - props = context._resources.copy() - props.update(mapping) - return templatekw.showlatesttags(pattern, **pycompat.strkwargs(props)) - -@templatefunc('localdate(date[, tz])') -def localdate(context, mapping, args): - """Converts a date to the specified timezone. 
- The default is local date.""" - if not (1 <= len(args) <= 2): - # i18n: "localdate" is a keyword - raise error.ParseError(_("localdate expects one or two arguments")) - - date = evalfuncarg(context, mapping, args[0]) - try: - date = util.parsedate(date) - except AttributeError: # not str nor date tuple - # i18n: "localdate" is a keyword - raise error.ParseError(_("localdate expects a date information")) - if len(args) >= 2: - tzoffset = None - tz = evalfuncarg(context, mapping, args[1]) - if isinstance(tz, str): - tzoffset, remainder = util.parsetimezone(tz) - if remainder: - tzoffset = None - if tzoffset is None: - try: - tzoffset = int(tz) - except (TypeError, ValueError): - # i18n: "localdate" is a keyword - raise error.ParseError(_("localdate expects a timezone")) - else: - tzoffset = util.makedate()[1] - return (date[0], tzoffset) - -@templatefunc('max(iterable)') -def max_(context, mapping, args, **kwargs): - """Return the max of an iterable""" - if len(args) != 1: - # i18n: "max" is a keyword - raise error.ParseError(_("max expects one argument")) - - iterable = evalfuncarg(context, mapping, args[0]) - try: - x = max(iterable) - except (TypeError, ValueError): - # i18n: "max" is a keyword - raise error.ParseError(_("max first argument should be an iterable")) - return templatekw.wraphybridvalue(iterable, x, x) - -@templatefunc('min(iterable)') -def min_(context, mapping, args, **kwargs): - """Return the min of an iterable""" - if len(args) != 1: - # i18n: "min" is a keyword - raise error.ParseError(_("min expects one argument")) - - iterable = evalfuncarg(context, mapping, args[0]) - try: - x = min(iterable) - except (TypeError, ValueError): - # i18n: "min" is a keyword - raise error.ParseError(_("min first argument should be an iterable")) - return templatekw.wraphybridvalue(iterable, x, x) - -@templatefunc('mod(a, b)') -def mod(context, mapping, args): - """Calculate a mod b such that a / b + a mod b == a""" - if not len(args) == 2: - # i18n: "mod" is a 
keyword - raise error.ParseError(_("mod expects two arguments")) - - func = lambda a, b: a % b - return runarithmetic(context, mapping, (func, args[0], args[1])) - -@templatefunc('obsfateoperations(markers)') -def obsfateoperations(context, mapping, args): - """Compute obsfate related information based on markers (EXPERIMENTAL)""" - if len(args) != 1: - # i18n: "obsfateoperations" is a keyword - raise error.ParseError(_("obsfateoperations expects one argument")) - - markers = evalfuncarg(context, mapping, args[0]) - - try: - data = obsutil.markersoperations(markers) - return templatekw.hybridlist(data, name='operation') - except (TypeError, KeyError): - # i18n: "obsfateoperations" is a keyword - errmsg = _("obsfateoperations first argument should be an iterable") - raise error.ParseError(errmsg) - -@templatefunc('obsfatedate(markers)') -def obsfatedate(context, mapping, args): - """Compute obsfate related information based on markers (EXPERIMENTAL)""" - if len(args) != 1: - # i18n: "obsfatedate" is a keyword - raise error.ParseError(_("obsfatedate expects one argument")) - - markers = evalfuncarg(context, mapping, args[0]) - - try: - data = obsutil.markersdates(markers) - return templatekw.hybridlist(data, name='date', fmt='%d %d') - except (TypeError, KeyError): - # i18n: "obsfatedate" is a keyword - errmsg = _("obsfatedate first argument should be an iterable") - raise error.ParseError(errmsg) - -@templatefunc('obsfateusers(markers)') -def obsfateusers(context, mapping, args): - """Compute obsfate related information based on markers (EXPERIMENTAL)""" - if len(args) != 1: - # i18n: "obsfateusers" is a keyword - raise error.ParseError(_("obsfateusers expects one argument")) - - markers = evalfuncarg(context, mapping, args[0]) - - try: - data = obsutil.markersusers(markers) - return templatekw.hybridlist(data, name='user') - except (TypeError, KeyError, ValueError): - # i18n: "obsfateusers" is a keyword - msg = _("obsfateusers first argument should be an iterable 
of " - "obsmakers") - raise error.ParseError(msg) - -@templatefunc('obsfateverb(successors, markers)') -def obsfateverb(context, mapping, args): - """Compute obsfate related information based on successors (EXPERIMENTAL)""" - if len(args) != 2: - # i18n: "obsfateverb" is a keyword - raise error.ParseError(_("obsfateverb expects two arguments")) - - successors = evalfuncarg(context, mapping, args[0]) - markers = evalfuncarg(context, mapping, args[1]) - - try: - return obsutil.obsfateverb(successors, markers) - except TypeError: - # i18n: "obsfateverb" is a keyword - errmsg = _("obsfateverb first argument should be countable") - raise error.ParseError(errmsg) - -@templatefunc('relpath(path)') -def relpath(context, mapping, args): - """Convert a repository-absolute path into a filesystem path relative to - the current working directory.""" - if len(args) != 1: - # i18n: "relpath" is a keyword - raise error.ParseError(_("relpath expects one argument")) - - repo = context.resource(mapping, 'ctx').repo() - path = evalstring(context, mapping, args[0]) - return repo.pathto(path) - -@templatefunc('revset(query[, formatargs...])') -def revset(context, mapping, args): - """Execute a revision set query. 
See - :hg:`help revset`.""" - if not len(args) > 0: - # i18n: "revset" is a keyword - raise error.ParseError(_("revset expects one or more arguments")) - - raw = evalstring(context, mapping, args[0]) - ctx = context.resource(mapping, 'ctx') - repo = ctx.repo() - - def query(expr): - m = revsetmod.match(repo.ui, expr, repo=repo) - return m(repo) - - if len(args) > 1: - formatargs = [evalfuncarg(context, mapping, a) for a in args[1:]] - revs = query(revsetlang.formatspec(raw, *formatargs)) - revs = list(revs) - else: - cache = context.resource(mapping, 'cache') - revsetcache = cache.setdefault("revsetcache", {}) - if raw in revsetcache: - revs = revsetcache[raw] - else: - revs = query(raw) - revs = list(revs) - revsetcache[raw] = revs - - # TODO: pass (context, mapping) pair to keyword function - props = context._resources.copy() - props.update(mapping) - return templatekw.showrevslist("revision", revs, - **pycompat.strkwargs(props)) - -@templatefunc('rstdoc(text, style)') -def rstdoc(context, mapping, args): - """Format reStructuredText.""" - if len(args) != 2: - # i18n: "rstdoc" is a keyword - raise error.ParseError(_("rstdoc expects two arguments")) - - text = evalstring(context, mapping, args[0]) - style = evalstring(context, mapping, args[1]) - - return minirst.format(text, style=style, keep=['verbose']) - -@templatefunc('separate(sep, args)', argspec='sep *args') -def separate(context, mapping, args): - """Add a separator between non-empty arguments.""" - if 'sep' not in args: - # i18n: "separate" is a keyword - raise error.ParseError(_("separate expects at least one argument")) - - sep = evalstring(context, mapping, args['sep']) - first = True - for arg in args['args']: - argstr = evalstring(context, mapping, arg) - if not argstr: - continue - if first: - first = False - else: - yield sep - yield argstr - -@templatefunc('shortest(node, minlength=4)') -def shortest(context, mapping, args): - """Obtain the shortest representation of - a node.""" - if not (1 <= 
len(args) <= 2): - # i18n: "shortest" is a keyword - raise error.ParseError(_("shortest() expects one or two arguments")) - - node = evalstring(context, mapping, args[0]) - - minlength = 4 - if len(args) > 1: - minlength = evalinteger(context, mapping, args[1], - # i18n: "shortest" is a keyword - _("shortest() expects an integer minlength")) - - # _partialmatch() of filtered changelog could take O(len(repo)) time, - # which would be unacceptably slow. so we look for hash collision in - # unfiltered space, which means some hashes may be slightly longer. - cl = context.resource(mapping, 'ctx')._repo.unfiltered().changelog - return cl.shortest(node, minlength) - -@templatefunc('strip(text[, chars])') -def strip(context, mapping, args): - """Strip characters from a string. By default, - strips all leading and trailing whitespace.""" - if not (1 <= len(args) <= 2): - # i18n: "strip" is a keyword - raise error.ParseError(_("strip expects one or two arguments")) - - text = evalstring(context, mapping, args[0]) - if len(args) == 2: - chars = evalstring(context, mapping, args[1]) - return text.strip(chars) - return text.strip() - -@templatefunc('sub(pattern, replacement, expression)') -def sub(context, mapping, args): - """Perform text substitution - using regular expressions.""" - if len(args) != 3: - # i18n: "sub" is a keyword - raise error.ParseError(_("sub expects three arguments")) - - pat = evalstring(context, mapping, args[0]) - rpl = evalstring(context, mapping, args[1]) - src = evalstring(context, mapping, args[2]) - try: - patre = re.compile(pat) - except re.error: - # i18n: "sub" is a keyword - raise error.ParseError(_("sub got an invalid pattern: %s") % pat) - try: - yield patre.sub(rpl, src) - except re.error: - # i18n: "sub" is a keyword - raise error.ParseError(_("sub got an invalid replacement: %s") % rpl) - -@templatefunc('startswith(pattern, text)') -def startswith(context, mapping, args): - """Returns the value from the "text" argument - if it begins with 
the content from the "pattern" argument.""" - if len(args) != 2: - # i18n: "startswith" is a keyword - raise error.ParseError(_("startswith expects two arguments")) - - patn = evalstring(context, mapping, args[0]) - text = evalstring(context, mapping, args[1]) - if text.startswith(patn): - return text - return '' - -@templatefunc('word(number, text[, separator])') -def word(context, mapping, args): - """Return the nth word from a string.""" - if not (2 <= len(args) <= 3): - # i18n: "word" is a keyword - raise error.ParseError(_("word expects two or three arguments, got %d") - % len(args)) - - num = evalinteger(context, mapping, args[0], - # i18n: "word" is a keyword - _("word expects an integer index")) - text = evalstring(context, mapping, args[1]) - if len(args) == 3: - splitter = evalstring(context, mapping, args[2]) - else: - splitter = None - - tokens = text.split(splitter) - if num >= len(tokens) or num < -len(tokens): - return '' - else: - return tokens[num] - # methods to interpret function arguments or inner expressions (e.g. {_(x)}) exprmethods = { - "integer": lambda e, c: (runinteger, e[1]), - "string": lambda e, c: (runstring, e[1]), - "symbol": lambda e, c: (runsymbol, e[1]), + "integer": lambda e, c: (templateutil.runinteger, e[1]), + "string": lambda e, c: (templateutil.runstring, e[1]), + "symbol": lambda e, c: (templateutil.runsymbol, e[1]), "template": buildtemplate, "group": lambda e, c: compileexp(e[1], c, exprmethods), ".": buildmember, @@ -1252,41 +535,47 @@ # template engine -stringify = templatefilters.stringify - -def _flatten(thing): - '''yield a single stream from a possibly nested set of iterators''' - thing = templatekw.unwraphybrid(thing) - if isinstance(thing, bytes): - yield thing - elif isinstance(thing, str): - # We can only hit this on Python 3, and it's here to guard - # against infinite recursion. 
- raise error.ProgrammingError('Mercurial IO including templates is done' - ' with bytes, not strings') - elif thing is None: - pass - elif not util.safehasattr(thing, '__iter__'): - yield pycompat.bytestr(thing) - else: - for i in thing: - i = templatekw.unwraphybrid(i) - if isinstance(i, bytes): - yield i - elif i is None: - pass - elif not util.safehasattr(i, '__iter__'): - yield pycompat.bytestr(i) - else: - for j in _flatten(i): - yield j - def unquotestring(s): '''unwrap quotes if any; otherwise returns unmodified string''' if len(s) < 2 or s[0] not in "'\"" or s[0] != s[-1]: return s return s[1:-1] +class resourcemapper(object): + """Mapper of internal template resources""" + + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def availablekeys(self, context, mapping): + """Return a set of available resource keys based on the given mapping""" + + @abc.abstractmethod + def knownkeys(self): + """Return a set of supported resource keys""" + + @abc.abstractmethod + def lookup(self, context, mapping, key): + """Return a resource for the key if available; otherwise None""" + + @abc.abstractmethod + def populatemap(self, context, origmapping, newmapping): + """Return a dict of additional mapping items which should be paired + with the given new mapping""" + +class nullresourcemapper(resourcemapper): + def availablekeys(self, context, mapping): + return set() + + def knownkeys(self): + return set() + + def lookup(self, context, mapping, key): + return None + + def populatemap(self, context, origmapping, newmapping): + return {} + class engine(object): '''template expansion engine. 
@@ -1313,34 +602,63 @@ if filters is None: filters = {} self._filters = filters + self._funcs = templatefuncs.funcs # make this a parameter if needed if defaults is None: defaults = {} if resources is None: - resources = {} + resources = nullresourcemapper() self._defaults = defaults self._resources = resources self._aliasmap = _aliasrules.buildmap(aliases) self._cache = {} # key: (func, data) + self._tmplcache = {} # literal template: (func, data) + + def overlaymap(self, origmapping, newmapping): + """Create combined mapping from the original mapping and partial + mapping to override the original""" + # do not copy symbols which overrides the defaults depending on + # new resources, so the defaults will be re-evaluated (issue5612) + knownres = self._resources.knownkeys() + newres = self._resources.availablekeys(self, newmapping) + mapping = {k: v for k, v in origmapping.iteritems() + if (k in knownres # not a symbol per self.symbol() + or newres.isdisjoint(self._defaultrequires(k)))} + mapping.update(newmapping) + mapping.update( + self._resources.populatemap(self, origmapping, newmapping)) + return mapping + + def _defaultrequires(self, key): + """Resource keys required by the specified default symbol function""" + v = self._defaults.get(key) + if v is None or not callable(v): + return () + return getattr(v, '_requires', ()) def symbol(self, mapping, key): """Resolve symbol to value or function; None if nothing found""" v = None - if key not in self._resources: + if key not in self._resources.knownkeys(): v = mapping.get(key) if v is None: v = self._defaults.get(key) return v + def availableresourcekeys(self, mapping): + """Return a set of available resource keys based on the given mapping""" + return self._resources.availablekeys(self, mapping) + + def knownresourcekeys(self): + """Return a set of supported resource keys""" + return self._resources.knownkeys() + def resource(self, mapping, key): """Return internal data (e.g. 
cache) used for keyword/function evaluation""" - v = None - if key in self._resources: - v = mapping.get(key) + v = self._resources.lookup(self, mapping, key) if v is None: - v = self._resources.get(key) - if v is None: - raise error.Abort(_('template resource not available: %s') % key) + raise templateutil.ResourceUnavailable( + _('template resource not available: %s') % key) return v def _load(self, t): @@ -1358,12 +676,46 @@ raise return self._cache[t] + def _parse(self, tmpl): + """Parse and cache a literal template""" + if tmpl not in self._tmplcache: + x = parse(tmpl) + self._tmplcache[tmpl] = compileexp(x, self, methods) + return self._tmplcache[tmpl] + + def preload(self, t): + """Load, parse, and cache the specified template if available""" + try: + self._load(t) + return True + except templateutil.TemplateNotFound: + return False + def process(self, t, mapping): '''Perform expansion. t is name of map element to expand. mapping contains added elements for use during expansion. Is a generator.''' func, data = self._load(t) - return _flatten(func(self, mapping, data)) + return self._expand(func, data, mapping) + + def expand(self, tmpl, mapping): + """Perform expansion over a literal template + + No user aliases will be expanded since this is supposed to be called + with an internal template string. + """ + func, data = self._parse(tmpl) + return self._expand(func, data, mapping) + + def _expand(self, func, data, mapping): + # populate additional items only if they don't exist in the given + # mapping. this is slightly different from overlaymap() because the + # initial 'revcache' may contain pre-computed items. 
+ extramapping = self._resources.populatemap(self, {}, mapping) + if extramapping: + extramapping.update(mapping) + mapping = extramapping + return templateutil.flatten(self, mapping, func(self, mapping, data)) engines = {'default': engine} @@ -1431,9 +783,6 @@ aliases.extend(conf['templatealias'].items()) return cache, tmap, aliases -class TemplateNotFound(error.Abort): - pass - class templater(object): def __init__(self, filters=None, defaults=None, resources=None, @@ -1443,8 +792,8 @@ - ``filters``: a dict of functions to transform a value into another. - ``defaults``: a dict of symbol values/functions; may be overridden by a ``mapping`` dict. - - ``resources``: a dict of internal data (e.g. cache), inaccessible - from user template; may be overridden by a ``mapping`` dict. + - ``resources``: a resourcemapper object to look up internal data + (e.g. cache), inaccessible from user template. - ``cache``: a dict of preloaded template fragments. - ``aliases``: a list of alias (name, replacement) pairs. 
@@ -1455,8 +804,6 @@ filters = {} if defaults is None: defaults = {} - if resources is None: - resources = {} if cache is None: cache = {} self.cache = cache.copy() @@ -1464,8 +811,7 @@ self.filters = templatefilters.filters.copy() self.filters.update(filters) self.defaults = defaults - self._resources = {'templ': self} - self._resources.update(resources) + self._resources = resources self._aliases = aliases self.minchunk, self.maxchunk = minchunk, maxchunk self.ecache = {} @@ -1490,20 +836,26 @@ try: self.cache[t] = util.readfile(self.map[t][1]) except KeyError as inst: - raise TemplateNotFound(_('"%s" not in template map') % - inst.args[0]) + raise templateutil.TemplateNotFound( + _('"%s" not in template map') % inst.args[0]) except IOError as inst: - raise IOError(inst.args[0], _('template file %s: %s') % - (self.map[t][1], inst.args[1])) + reason = (_('template file %s: %s') + % (self.map[t][1], + stringutil.forcebytestr(inst.args[1]))) + raise IOError(inst.args[0], encoding.strfromlocal(reason)) return self.cache[t] - def render(self, mapping): + def renderdefault(self, mapping): """Render the default unnamed template and return result as string""" - mapping = pycompat.strkwargs(mapping) - return stringify(self('', **mapping)) + return self.render('', mapping) - def __call__(self, t, **mapping): - mapping = pycompat.byteskwargs(mapping) + def render(self, t, mapping): + """Render the specified named template and return result as string""" + return b''.join(self.generate(t, mapping)) + + def generate(self, t, mapping): + """Return a generator that renders the specified named template and + yields chunks""" ttype = t in self.map and self.map[t][0] or 'default' if ttype not in self.ecache: try: @@ -1546,16 +898,16 @@ if paths is None: paths = templatepaths() - elif isinstance(paths, str): + elif isinstance(paths, bytes): paths = [paths] - if isinstance(styles, str): + if isinstance(styles, bytes): styles = [styles] for style in styles: # only plain name is 
allowed to honor template paths if (not style - or style in (os.curdir, os.pardir) + or style in (pycompat.oscurdir, pycompat.ospardir) or pycompat.ossep in style or pycompat.osaltsep and pycompat.osaltsep in style): continue @@ -1569,12 +921,3 @@ return style, mapfile raise RuntimeError("No hgweb templates found in %r" % paths) - -def loadfunction(ui, extname, registrarobj): - """Load template function from specified registrarobj - """ - for name, func in registrarobj._table.iteritems(): - funcs[name] = func - -# tell hggettext to extract docstrings from these functions: -i18nfunctions = funcs.values() diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/gitweb/changeset.tmpl --- a/mercurial/templates/gitweb/changeset.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/gitweb/changeset.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -44,7 +44,8 @@ changeset {rev} {node|short} -{if(obsolete, 'obsolete{succsandmarkers%obsfateentry}')} +{if(obsolete, succsandmarkers%obsfateentry)} +{if(instabilities, whyunstable%whyunstableentry)} {ifeq(count(parent), '2', parent%changesetparentdiff, parent%changesetparent)} {child%changesetchild} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/gitweb/map --- a/mercurial/templates/gitweb/map Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/gitweb/map Wed Apr 18 15:32:08 2018 -0400 @@ -57,7 +57,6 @@ fileellipses = '...' 
changelogentry = changelogentry.tmpl -searchentry = changelogentry.tmpl changeset = changeset.tmpl manifest = manifest.tmpl direntry = ' @@ -275,7 +274,20 @@ obsfatesuccessors = '{if(successors, ' as ')}{successors%successorlink}' obsfateverb = '{obsfateverb(successors, markers)}' obsfateoperations = '{if(obsfateoperations(markers), ' using {join(obsfateoperations(markers), ', ')}')}' -obsfateentry = '{obsfateverb}{obsfateoperations}{obsfatesuccessors}' +obsfateusers = '{if(obsfateusers(markers), ' by {join(obsfateusers(markers)%'{user|obfuscate}', ', ')}')}' +obsfatedate = '{if(obsfatedate(markers), ' {ifeq(min(obsfatedate(markers)), max(obsfatedate(markers)), '{min(obsfatedate(markers))|rfc822date}', 'between {min(obsfatedate(markers))|rfc822date} and {max(obsfatedate(markers))|rfc822date}')}')}' +obsfateentry = ' + + obsolete + {obsfateverb}{obsfateoperations}{obsfatesuccessors}{obsfateusers}{obsfatedate} + ' +instabilitychangesetlink = '{node|short}' +divergentnode = '{instabilitychangesetlink} ({phase})' +whyunstableentry = ' + + unstable + {instability}: {if(divergentnodes, divergentnodes%divergentnode)} {reason} {instabilitychangesetlink} + ' shortlogentry = ' {date|rfc822date} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/gitweb/search.tmpl --- a/mercurial/templates/gitweb/search.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/gitweb/search.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -30,6 +30,6 @@
searching for {query|escape}
-{entries} +{entries%changelogentry} {footer} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/gitweb/summary.tmpl --- a/mercurial/templates/gitweb/summary.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/gitweb/summary.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -36,13 +36,13 @@ -{shortlog} +{shortlog%shortlogentry}
...
-{tags} +{tags%tagentry}
...
diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/json/map --- a/mercurial/templates/json/map Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/json/map Wed Apr 18 15:32:08 2018 -0400 @@ -18,9 +18,8 @@ search = '\{ "node": {node|json}, "query": {query|json}, - "entries": [{join(entries%searchentry, ", ")}] + "entries": [{join(entries%changelistentry, ", ")}] }' -searchentry = '{changelistentry}' # changelog and shortlog are the same web API but with different # number of entries. changelog = changelist.tmpl @@ -143,7 +142,7 @@ "author": {author|utf8|json}, "parents": [{join(parent%changesetparent, ", ")}], "children": [{join(child%changesetparent, ", ")}], - "diff": [{join(diff%diffblock, ", ")}] + "diff": [{join(diff, ", ")}] }' diffblock = '\{ "blockno": {blockno|json}, diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/monoblue/changeset.tmpl --- a/mercurial/templates/monoblue/changeset.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/monoblue/changeset.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -48,7 +48,8 @@ {branch%changesetbranch}
changeset {rev}
{node|short}
- {if(obsolete, '
obsolete
{succsandmarkers%obsfateentry}
')} + {if(obsolete, succsandmarkers%obsfateentry)} + {if(instabilities, whyunstable%whyunstableentry)} {ifeq(count(parent), '2', parent%changesetparentdiff, parent%changesetparent)} {child%changesetchild} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/monoblue/map --- a/mercurial/templates/monoblue/map Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/monoblue/map Wed Apr 18 15:32:08 2018 -0400 @@ -57,7 +57,6 @@ fileellipses = '...' changelogentry = changelogentry.tmpl -searchentry = changelogentry.tmpl changeset = changeset.tmpl manifest = manifest.tmpl direntry = ' @@ -233,7 +232,16 @@ obsfatesuccessors = '{if(successors, ' as ')}{successors%successorlink}' obsfateverb = '{obsfateverb(successors, markers)}' obsfateoperations = '{if(obsfateoperations(markers), ' using {join(obsfateoperations(markers), ', ')}')}' -obsfateentry = '{obsfateverb}{obsfateoperations}{obsfatesuccessors}' +obsfateusers = '{if(obsfateusers(markers), ' by {join(obsfateusers(markers)%'{user|obfuscate}', ', ')}')}' +obsfatedate = '{if(obsfatedate(markers), ' {ifeq(min(obsfatedate(markers)), max(obsfatedate(markers)), '{min(obsfatedate(markers))|rfc822date}', 'between {min(obsfatedate(markers))|rfc822date} and {max(obsfatedate(markers))|rfc822date}')}')}' +obsfateentry = ' +
obsolete
+
{obsfateverb}{obsfateoperations}{obsfatesuccessors}{obsfateusers}{obsfatedate}
' +instabilitychangesetlink = '{node|short}' +divergentnode = '{instabilitychangesetlink} ({phase})' +whyunstableentry = ' +
unstable
+
{instability}: {if(divergentnodes, divergentnodes%divergentnode)} {reason} {instabilitychangesetlink}
' shortlogentry = ' {date|rfc822date} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/monoblue/search.tmpl --- a/mercurial/templates/monoblue/search.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/monoblue/search.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -26,6 +26,6 @@ - {entries} + {entries%changelogentry} {footer} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/monoblue/summary.tmpl --- a/mercurial/templates/monoblue/summary.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/monoblue/summary.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -39,7 +39,7 @@

Changes

-{shortlog} +{shortlog%shortlogentry} @@ -47,7 +47,7 @@

Tags

...
-{tags} +{tags%tagentry} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/paper/changeset.tmpl --- a/mercurial/templates/paper/changeset.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/paper/changeset.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -51,7 +51,11 @@ {if(obsolete, ' - + +')} +{if(instabilities, ' + + ')} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/paper/map --- a/mercurial/templates/paper/map Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/paper/map Wed Apr 18 15:32:08 2018 -0400 @@ -33,7 +33,6 @@ diffstatlink = diffstat.tmpl diffstatnolink = diffstat.tmpl changelogentry = shortlogentry.tmpl -searchentry = shortlogentry.tmpl changeset = changeset.tmpl manifest = manifest.tmpl @@ -213,7 +212,12 @@ obsfatesuccessors = '{if(successors, ' as ')}{successors%successorlink}' obsfateverb = '{obsfateverb(successors, markers)}' obsfateoperations = '{if(obsfateoperations(markers), ' using {join(obsfateoperations(markers), ', ')}')}' -obsfateentry = '{obsfateverb}{obsfateoperations}{obsfatesuccessors}' +obsfateusers = '{if(obsfateusers(markers), ' by {join(obsfateusers(markers)%'{user|obfuscate}', ', ')}')}' +obsfatedate = '{if(obsfatedate(markers), ' {ifeq(min(obsfatedate(markers)), max(obsfatedate(markers)), '{min(obsfatedate(markers))|rfc822date}', 'between {min(obsfatedate(markers))|rfc822date} and {max(obsfatedate(markers))|rfc822date}')}')}' +obsfateentry = '{obsfateverb}{obsfateoperations}{obsfatesuccessors}{obsfateusers}{obsfatedate}' +instabilitychangesetlink = '{node|short}' +divergentnode = '{instabilitychangesetlink} ({phase})' +whyunstableentry = '{instability}: {if(divergentnodes, divergentnodes%divergentnode)} {reason} {instabilitychangesetlink}' filediffparent = ' diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/paper/search.tmpl --- a/mercurial/templates/paper/search.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/paper/search.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -49,7 +49,7 @@ -{entries} 
+{entries%changelogentry}
...
obsolete{succsandmarkers%obsfateentry}{join(succsandmarkers%obsfateentry, '
\n')}
unstable{join(whyunstable%whyunstableentry, '
\n')}
parents
diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/raw/map --- a/mercurial/templates/raw/map Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/raw/map Wed Apr 18 15:32:08 2018 -0400 @@ -3,7 +3,6 @@ changelog = changelog.tmpl changelogentry = logentry.tmpl search = search.tmpl -searchentry = logentry.tmpl mimetype = 'text/plain; charset={encoding}' header = '' footer = '' diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/spartan/changelogentry.tmpl --- a/mercurial/templates/spartan/changelogentry.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/spartan/changelogentry.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -22,14 +22,8 @@ phase: {phase|escape} ')} - {if(obsolete, ' - obsolete: - {succsandmarkers%obsfateentry} - ')} - {ifeq(count(instabilities), '0', '', ' - instabilities: - {instabilities%"{instability} "|escape} - ')} + {if(obsolete, succsandmarkers%obsfateentry)} + {if(instabilities, whyunstable%whyunstableentry)} files: {files} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/spartan/changeset.tmpl --- a/mercurial/templates/spartan/changeset.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/spartan/changeset.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -37,14 +37,8 @@ phase: {phase|escape} ')} -{if(obsolete, ' - obsolete: - {succsandmarkers%obsfateentry} -')} -{ifeq(count(instabilities), '0', '', ' - instabilities: - {instabilities%"{instability} "|escape} -')} +{if(obsolete, succsandmarkers%obsfateentry)} +{if(instabilities, whyunstable%whyunstableentry)} files: {files} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/spartan/map --- a/mercurial/templates/spartan/map Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/spartan/map Wed Apr 18 15:32:08 2018 -0400 @@ -17,7 +17,6 @@ filenolink = '{file|escape} ' fileellipses = '...' 
changelogentry = changelogentry.tmpl -searchentry = changelogentry.tmpl changeset = changeset.tmpl manifest = manifest.tmpl @@ -170,7 +169,20 @@ obsfatesuccessors = '{if(successors, ' as ')}{successors%successorlink}' obsfateverb = '{obsfateverb(successors, markers)}' obsfateoperations = '{if(obsfateoperations(markers), ' using {join(obsfateoperations(markers), ', ')}')}' -obsfateentry = '{obsfateverb}{obsfateoperations}{obsfatesuccessors}' +obsfateusers = '{if(obsfateusers(markers), ' by {join(obsfateusers(markers)%'{user|obfuscate}', ', ')}')}' +obsfatedate = '{if(obsfatedate(markers), ' {ifeq(min(obsfatedate(markers)), max(obsfatedate(markers)), '{min(obsfatedate(markers))|rfc822date}', 'between {min(obsfatedate(markers))|rfc822date} and {max(obsfatedate(markers))|rfc822date}')}')}' +obsfateentry = ' + + obsolete: + {obsfateverb}{obsfateoperations}{obsfatesuccessors}{obsfateusers}{obsfatedate} + ' +instabilitychangesetlink = '{node|short}' +divergentnode = '{instabilitychangesetlink} ({phase})' +whyunstableentry = ' + + unstable: + {instability}: {if(divergentnodes, divergentnodes%divergentnode)} {reason} {instabilitychangesetlink} + ' filediffparent = ' parent {rev}: diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/spartan/search.tmpl --- a/mercurial/templates/spartan/search.tmpl Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/spartan/search.tmpl Wed Apr 18 15:32:08 2018 -0400 @@ -24,7 +24,7 @@

-{entries} +{entries%changelogentry}
{sessionvars%hiddenformentry} diff -r fb92df8b634c -r ed5448edcbfa mercurial/templates/static/followlines.js --- a/mercurial/templates/static/followlines.js Wed Apr 04 10:35:09 2018 -0400 +++ b/mercurial/templates/static/followlines.js Wed Apr 18 15:32:08 2018 -0400 @@ -76,7 +76,7 @@ if ( childSupportElms.length > 0 ) { btnSupportElm = childSupportElms[0]; } - var refNode = btnSupportElm.children[0]; // node to insert