mercurial/pathencode.c
changeset 17616 9535a0dc41f2
parent 17606 318fb32b980e
child 17691 c6c7e466dd3a
equal deleted inserted replaced
17615:9e2dc0d292cd 17616:9535a0dc41f2
     4  Copyright 2012 Facebook
     4  Copyright 2012 Facebook
     5 
     5 
     6  This software may be used and distributed according to the terms of
     6  This software may be used and distributed according to the terms of
     7  the GNU General Public License, incorporated herein by reference.
     7  the GNU General Public License, incorporated herein by reference.
     8 */
     8 */
       
     9 
       
    10 /*
       
    11  * An implementation of the name encoding scheme used by the fncache
       
    12  * store.  The common case is of a path < 120 bytes long, which is
       
    13  * handled either in a single pass with no allocations or two passes
       
    14  * with a single allocation.  For longer paths, multiple passes are
       
    15  * required.
       
    16  */
     9 
    17 
    10 #include <Python.h>
    18 #include <Python.h>
    11 #include <assert.h>
    19 #include <assert.h>
    12 #include <ctype.h>
    20 #include <ctype.h>
    13 #include <stdlib.h>
    21 #include <stdlib.h>
    14 #include <string.h>
    22 #include <string.h>
    15 
    23 
    16 #include "util.h"
    24 #include "util.h"
       
    25 
       
    26 /* state machine for the fast path */
       
    27 enum path_state {
       
    28 	START,   /* first byte of a path component */
       
    29 	A,       /* "AUX" */
       
    30 	AU,
       
    31 	THIRD,   /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
       
    32 	C,       /* "CON" or "COMn" */
       
    33 	CO,
       
    34 	COMLPT,  /* "COM" or "LPT" */
       
    35 	COMLPTn,
       
    36 	L,
       
    37 	LP,
       
    38 	N,
       
    39 	NU,
       
    40 	P,       /* "PRN" */
       
    41 	PR,
       
    42 	LDOT,    /* leading '.' */
       
    43 	DOT,     /* '.' in a non-leading position */
       
    44 	H,       /* ".h" */
       
    45 	HGDI,    /* ".hg", ".d", or ".i" */
       
    46 	SPACE,
       
    47 	DEFAULT, /* byte of a path component after the first */
       
    48 };
    17 
    49 
    18 /* state machine for dir-encoding */
    50 /* state machine for dir-encoding */
    19 enum dir_state {
    51 enum dir_state {
    20 	DDOT,
    52 	DDOT,
    21 	DH,
    53 	DH,
    22 	DHGDI,
    54 	DHGDI,
    23 	DDEFAULT,
    55 	DDEFAULT,
    24 };
    56 };
    25 
    57 
       
    58 static inline int isset(const uint32_t bitset[], char c)
       
    59 {
       
    60 	return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));
       
    61 }
       
    62 
    26 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
    63 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
    27                             char c)
    64                             char c)
    28 {
    65 {
    29 	if (dest) {
    66 	if (dest) {
    30 		assert(*destlen < destsize);
    67 		assert(*destlen < destsize);
    39 	if (dest) {
    76 	if (dest) {
    40 		assert(*destlen + len < destsize);
    77 		assert(*destlen + len < destsize);
    41 		memcpy((void *)&dest[*destlen], src, len);
    78 		memcpy((void *)&dest[*destlen], src, len);
    42 	}
    79 	}
    43 	*destlen += len;
    80 	*destlen += len;
       
    81 }
       
    82 
       
    83 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
       
    84 			     uint8_t c)
       
    85 {
       
    86 	static const char hexdigit[] = "0123456789abcdef";
       
    87 
       
    88 	charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
       
    89 	charcopy(dest, destlen, destsize, hexdigit[c & 15]);
       
    90 }
       
    91 
       
    92 /* 3-byte escape: tilde followed by two hex digits */
       
    93 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
       
    94 			   char c)
       
    95 {
       
    96 	charcopy(dest, destlen, destsize, '~');
       
    97 	hexencode(dest, destlen, destsize, c);
    44 }
    98 }
    45 
    99 
    46 static Py_ssize_t _encodedir(char *dest, size_t destsize,
   100 static Py_ssize_t _encodedir(char *dest, size_t destsize,
    47                              const char *src, Py_ssize_t len)
   101                              const char *src, Py_ssize_t len)
    48 {
   102 {
   121 			   len + 1);
   175 			   len + 1);
   122 	}
   176 	}
   123 
   177 
   124 	return newobj;
   178 	return newobj;
   125 }
   179 }
       
   180 
       
   181 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
       
   182 			  char *dest, Py_ssize_t destlen, size_t destsize,
       
   183 			  const char *src, Py_ssize_t len,
       
   184 			  int encodedir)
       
   185 {
       
   186 	enum path_state state = START;
       
   187 	Py_ssize_t i = 0;
       
   188 
       
   189 	/*
       
   190 	 * Python strings end with a zero byte, which we use as a
       
   191 	 * terminal token as they are not valid inside path names.
       
   192 	 */
       
   193 
       
   194 	while (i < len) {
       
   195 		switch (state) {
       
   196 		case START:
       
   197 			switch (src[i]) {
       
   198 			case '/':
       
   199 				charcopy(dest, &destlen, destsize, src[i++]);
       
   200 				break;
       
   201 			case '.':
       
   202 				state = LDOT;
       
   203 				escape3(dest, &destlen, destsize, src[i++]);
       
   204 				break;
       
   205 			case ' ':
       
   206 				state = DEFAULT;
       
   207 				escape3(dest, &destlen, destsize, src[i++]);
       
   208 				break;
       
   209 			case 'a':
       
   210 				state = A;
       
   211 				charcopy(dest, &destlen, destsize, src[i++]);
       
   212 				break;
       
   213 			case 'c':
       
   214 				state = C;
       
   215 				charcopy(dest, &destlen, destsize, src[i++]);
       
   216 				break;
       
   217 			case 'l':
       
   218 				state = L;
       
   219 				charcopy(dest, &destlen, destsize, src[i++]);
       
   220 				break;
       
   221 			case 'n':
       
   222 				state = N;
       
   223 				charcopy(dest, &destlen, destsize, src[i++]);
       
   224 				break;
       
   225 			case 'p':
       
   226 				state = P;
       
   227 				charcopy(dest, &destlen, destsize, src[i++]);
       
   228 				break;
       
   229 			default:
       
   230 				state = DEFAULT;
       
   231 				break;
       
   232 			}
       
   233 			break;
       
   234 		case A:
       
   235 			if (src[i] == 'u') {
       
   236 				state = AU;
       
   237 				charcopy(dest, &destlen, destsize, src[i++]);
       
   238 			}
       
   239 			else state = DEFAULT;
       
   240 			break;
       
   241 		case AU:
       
   242 			if (src[i] == 'x') {
       
   243 				state = THIRD;
       
   244 				i++;
       
   245 			}
       
   246 			else state = DEFAULT;
       
   247 			break;
       
   248 		case THIRD:
       
   249 			state = DEFAULT;
       
   250 			switch (src[i]) {
       
   251 			case '.':
       
   252 			case '/':
       
   253 			case '\0':
       
   254 				escape3(dest, &destlen, destsize, src[i - 1]);
       
   255 				break;
       
   256 			default:
       
   257 				i--;
       
   258 				break;
       
   259 			}
       
   260 			break;
       
   261 		case C:
       
   262 			if (src[i] == 'o') {
       
   263 				state = CO;
       
   264 				charcopy(dest, &destlen, destsize, src[i++]);
       
   265 			}
       
   266 			else state = DEFAULT;
       
   267 			break;
       
   268 		case CO:
       
   269 			if (src[i] == 'm') {
       
   270 				state = COMLPT;
       
   271 				i++;
       
   272 			}
       
   273 			else if (src[i] == 'n') {
       
   274 				state = THIRD;
       
   275 				i++;
       
   276 			}
       
   277 			else state = DEFAULT;
       
   278 			break;
       
   279 		case COMLPT:
       
   280 			switch (src[i]) {
       
   281 			case '1': case '2': case '3': case '4': case '5':
       
   282 			case '6': case '7': case '8': case '9':
       
   283 				state = COMLPTn;
       
   284 				i++;
       
   285 				break;
       
   286 			default:
       
   287 				state = DEFAULT;
       
   288 				charcopy(dest, &destlen, destsize, src[i - 1]);
       
   289 				break;
       
   290 			}
       
   291 			break;
       
   292 		case COMLPTn:
       
   293 			state = DEFAULT;
       
   294 			switch (src[i]) {
       
   295 			case '.':
       
   296 			case '/':
       
   297 			case '\0':
       
   298 				escape3(dest, &destlen, destsize, src[i - 2]);
       
   299 				charcopy(dest, &destlen, destsize, src[i - 1]);
       
   300 				break;
       
   301 			default:
       
   302 				memcopy(dest, &destlen, destsize,
       
   303 					&src[i - 2], 2);
       
   304 				break;
       
   305 			}
       
   306 			break;
       
   307 		case L:
       
   308 			if (src[i] == 'p') {
       
   309 				state = LP;
       
   310 				charcopy(dest, &destlen, destsize, src[i++]);
       
   311 			}
       
   312 			else state = DEFAULT;
       
   313 			break;
       
   314 		case LP:
       
   315 			if (src[i] == 't') {
       
   316 				state = COMLPT;
       
   317 				i++;
       
   318 			}
       
   319 			else state = DEFAULT;
       
   320 			break;
       
   321 		case N:
       
   322 			if (src[i] == 'u') {
       
   323 				state = NU;
       
   324 				charcopy(dest, &destlen, destsize, src[i++]);
       
   325 			}
       
   326 			else state = DEFAULT;
       
   327 			break;
       
   328 		case NU:
       
   329 			if (src[i] == 'l') {
       
   330 				state = THIRD;
       
   331 				i++;
       
   332 			}
       
   333 			else state = DEFAULT;
       
   334 			break;
       
   335 		case P:
       
   336 			if (src[i] == 'r') {
       
   337 				state = PR;
       
   338 				charcopy(dest, &destlen, destsize, src[i++]);
       
   339 			}
       
   340 			else state = DEFAULT;
       
   341 			break;
       
   342 		case PR:
       
   343 			if (src[i] == 'n') {
       
   344 				state = THIRD;
       
   345 				i++;
       
   346 			}
       
   347 			else state = DEFAULT;
       
   348 			break;
       
   349 		case LDOT:
       
   350 			switch (src[i]) {
       
   351 			case 'd':
       
   352 			case 'i':
       
   353 				state = HGDI;
       
   354 				charcopy(dest, &destlen, destsize, src[i++]);
       
   355 				break;
       
   356 			case 'h':
       
   357 				state = H;
       
   358 				charcopy(dest, &destlen, destsize, src[i++]);
       
   359 				break;
       
   360 			default:
       
   361 				state = DEFAULT;
       
   362 				break;
       
   363 			}
       
   364 			break;
       
   365 		case DOT:
       
   366 			switch (src[i]) {
       
   367 			case '/':
       
   368 			case '\0':
       
   369 				state = START;
       
   370 				memcopy(dest, &destlen, destsize, "~2e", 3);
       
   371 				charcopy(dest, &destlen, destsize, src[i++]);
       
   372 				break;
       
   373 			case 'd':
       
   374 			case 'i':
       
   375 				state = HGDI;
       
   376 				charcopy(dest, &destlen, destsize, '.');
       
   377 				charcopy(dest, &destlen, destsize, src[i++]);
       
   378 				break;
       
   379 			case 'h':
       
   380 				state = H;
       
   381 				memcopy(dest, &destlen, destsize, ".h", 2);
       
   382 				i++;
       
   383 				break;
       
   384 			default:
       
   385 				state = DEFAULT;
       
   386 				charcopy(dest, &destlen, destsize, '.');
       
   387 				break;
       
   388 			}
       
   389 			break;
       
   390 		case H:
       
   391 			if (src[i] == 'g') {
       
   392 				state = HGDI;
       
   393 				charcopy(dest, &destlen, destsize, src[i++]);
       
   394 			}
       
   395 			else state = DEFAULT;
       
   396 			break;
       
   397 		case HGDI:
       
   398 			if (src[i] == '/') {
       
   399 				state = START;
       
   400 				if (encodedir)
       
   401 					memcopy(dest, &destlen, destsize, ".hg",
       
   402 						3);
       
   403 				charcopy(dest, &destlen, destsize, src[i++]);
       
   404 			}
       
   405 			else state = DEFAULT;
       
   406 			break;
       
   407 		case SPACE:
       
   408 			switch (src[i]) {
       
   409 			case '/':
       
   410 			case '\0':
       
   411 				state = START;
       
   412 				memcopy(dest, &destlen, destsize, "~20", 3);
       
   413 				charcopy(dest, &destlen, destsize, src[i++]);
       
   414 				break;
       
   415 			default:
       
   416 				state = DEFAULT;
       
   417 				charcopy(dest, &destlen, destsize, ' ');
       
   418 				break;
       
   419 			}
       
   420 			break;
       
   421 		case DEFAULT:
       
   422 			while (isset(onebyte, src[i])) {
       
   423 				charcopy(dest, &destlen, destsize, src[i++]);
       
   424 				if (i == len)
       
   425 					goto done;
       
   426 			}
       
   427 			switch (src[i]) {
       
   428 			case '.':
       
   429 				state = DOT;
       
   430 				i++;
       
   431 				break;
       
   432 			case ' ':
       
   433 				state = SPACE;
       
   434 				i++;
       
   435 				break;
       
   436 			case '/':
       
   437 				state = START;
       
   438 				charcopy(dest, &destlen, destsize, '/');
       
   439 				i++;
       
   440 				break;
       
   441 			default:
       
   442 				if (isset(onebyte, src[i])) {
       
   443 					do {
       
   444 						charcopy(dest, &destlen,
       
   445 							 destsize, src[i++]);
       
   446 					} while (i < len &&
       
   447 						 isset(onebyte, src[i]));
       
   448 				}
       
   449 				else if (isset(twobytes, src[i])) {
       
   450 					char c = src[i++];
       
   451 					charcopy(dest, &destlen, destsize, '_');
       
   452 					charcopy(dest, &destlen, destsize,
       
   453 						 c == '_' ? '_' : c + 32);
       
   454 				}
       
   455 				else
       
   456 					escape3(dest, &destlen, destsize,
       
   457 						src[i++]);
       
   458 				break;
       
   459 			}
       
   460 			break;
       
   461 		}
       
   462 	}
       
   463 done:
       
   464 	return destlen;
       
   465 }
       
   466 
       
   467 static Py_ssize_t basicencode(char *dest, size_t destsize,
       
   468 			      const char *src, Py_ssize_t len)
       
   469 {
       
   470 	static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
       
   471 
       
   472 	static const uint32_t onebyte[8] = {
       
   473 		1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
       
   474 	};
       
   475 
       
   476 	Py_ssize_t destlen = 0;
       
   477 
       
   478 	if (len < 5 || memcmp(src, "data/", 5) != 0) {
       
   479 		memcopy(dest, &destlen, destsize, src, len);
       
   480 		return destlen;
       
   481 	}
       
   482 
       
   483 	memcopy(dest, &destlen, destsize, "data/", 5);
       
   484 
       
   485 	return _encode(twobytes, onebyte, dest, destlen, destsize,
       
   486 		       src + 5, len - 5, 1);
       
   487 }
       
   488 
       
   489 static const Py_ssize_t maxstorepathlen = 120;
       
   490 
       
   491 /*
       
   492  * We currently implement only basic encoding.
       
   493  *
       
   494  * If a name is too long to encode due to Windows path name limits,
       
   495  * this function returns None.
       
   496  */
       
   497 PyObject *pathencode(PyObject *self, PyObject *args)
       
   498 {
       
   499 	Py_ssize_t len, newlen;
       
   500 	PyObject *pathobj, *newobj;
       
   501 	char *path;
       
   502 
       
   503 	if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
       
   504 		return NULL;
       
   505 
       
   506 	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
       
   507 		PyErr_SetString(PyExc_TypeError, "expected a string");
       
   508 		return NULL;
       
   509 	}
       
   510 
       
   511 	newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
       
   512 
       
   513 	if (newlen <= maxstorepathlen + 1) {
       
   514 		if (newlen == len + 1) {
       
   515 			Py_INCREF(pathobj);
       
   516 			return pathobj;
       
   517 		}
       
   518 
       
   519 		newobj = PyString_FromStringAndSize(NULL, newlen);
       
   520 
       
   521 		if (newobj) {
       
   522 			PyString_GET_SIZE(newobj)--;
       
   523 			basicencode(PyString_AS_STRING(newobj), newlen, path,
       
   524 				    len + 1);
       
   525 		}
       
   526 	} else {
       
   527 		newobj = Py_None;
       
   528 		Py_INCREF(newobj);
       
   529 	}
       
   530 
       
   531 	return newobj;
       
   532 }