4 Copyright 2012 Facebook |
4 Copyright 2012 Facebook |
5 |
5 |
6 This software may be used and distributed according to the terms of |
6 This software may be used and distributed according to the terms of |
7 the GNU General Public License, incorporated herein by reference. |
7 the GNU General Public License, incorporated herein by reference. |
8 */ |
8 */ |
|
9 |
|
10 /* |
|
11 * An implementation of the name encoding scheme used by the fncache |
|
12 * store. The common case is of a path < 120 bytes long, which is |
|
13 * handled either in a single pass with no allocations or two passes |
|
14 * with a single allocation. For longer paths, multiple passes are |
|
15 * required. |
|
16 */ |
9 |
17 |
10 #include <Python.h> |
18 #include <Python.h> |
11 #include <assert.h> |
19 #include <assert.h> |
12 #include <ctype.h> |
20 #include <ctype.h> |
13 #include <stdlib.h> |
21 #include <stdlib.h> |
14 #include <string.h> |
22 #include <string.h> |
15 |
23 |
16 #include "util.h" |
24 #include "util.h" |
|
25 |
|
26 /* state machine for the fast path */ |
|
27 enum path_state { |
|
28 START, /* first byte of a path component */ |
|
29 A, /* "AUX" */ |
|
30 AU, |
|
31 THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ |
|
32 C, /* "CON" or "COMn" */ |
|
33 CO, |
|
34 COMLPT, /* "COM" or "LPT" */ |
|
35 COMLPTn, |
|
36 L, |
|
37 LP, |
|
38 N, |
|
39 NU, |
|
40 P, /* "PRN" */ |
|
41 PR, |
|
42 LDOT, /* leading '.' */ |
|
43 DOT, /* '.' in a non-leading position */ |
|
44 H, /* ".h" */ |
|
45 HGDI, /* ".hg", ".d", or ".i" */ |
|
46 SPACE, |
|
47 DEFAULT, /* byte of a path component after the first */ |
|
48 }; |
17 |
49 |
18 /* state machine for dir-encoding */ |
50 /* state machine for dir-encoding */ |
19 enum dir_state { |
51 enum dir_state { |
20 DDOT, |
52 DDOT, |
21 DH, |
53 DH, |
22 DHGDI, |
54 DHGDI, |
23 DDEFAULT, |
55 DDEFAULT, |
24 }; |
56 }; |
25 |
57 |
|
58 static inline int isset(const uint32_t bitset[], char c) |
|
59 { |
|
60 return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31)); |
|
61 } |
|
62 |
26 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize, |
63 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize, |
27 char c) |
64 char c) |
28 { |
65 { |
29 if (dest) { |
66 if (dest) { |
30 assert(*destlen < destsize); |
67 assert(*destlen < destsize); |
121 len + 1); |
175 len + 1); |
122 } |
176 } |
123 |
177 |
124 return newobj; |
178 return newobj; |
125 } |
179 } |
|
180 |
|
181 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8], |
|
182 char *dest, Py_ssize_t destlen, size_t destsize, |
|
183 const char *src, Py_ssize_t len, |
|
184 int encodedir) |
|
185 { |
|
186 enum path_state state = START; |
|
187 Py_ssize_t i = 0; |
|
188 |
|
189 /* |
|
190 * Python strings end with a zero byte, which we use as a |
|
191 * terminal token as they are not valid inside path names. |
|
192 */ |
|
193 |
|
194 while (i < len) { |
|
195 switch (state) { |
|
196 case START: |
|
197 switch (src[i]) { |
|
198 case '/': |
|
199 charcopy(dest, &destlen, destsize, src[i++]); |
|
200 break; |
|
201 case '.': |
|
202 state = LDOT; |
|
203 escape3(dest, &destlen, destsize, src[i++]); |
|
204 break; |
|
205 case ' ': |
|
206 state = DEFAULT; |
|
207 escape3(dest, &destlen, destsize, src[i++]); |
|
208 break; |
|
209 case 'a': |
|
210 state = A; |
|
211 charcopy(dest, &destlen, destsize, src[i++]); |
|
212 break; |
|
213 case 'c': |
|
214 state = C; |
|
215 charcopy(dest, &destlen, destsize, src[i++]); |
|
216 break; |
|
217 case 'l': |
|
218 state = L; |
|
219 charcopy(dest, &destlen, destsize, src[i++]); |
|
220 break; |
|
221 case 'n': |
|
222 state = N; |
|
223 charcopy(dest, &destlen, destsize, src[i++]); |
|
224 break; |
|
225 case 'p': |
|
226 state = P; |
|
227 charcopy(dest, &destlen, destsize, src[i++]); |
|
228 break; |
|
229 default: |
|
230 state = DEFAULT; |
|
231 break; |
|
232 } |
|
233 break; |
|
234 case A: |
|
235 if (src[i] == 'u') { |
|
236 state = AU; |
|
237 charcopy(dest, &destlen, destsize, src[i++]); |
|
238 } |
|
239 else state = DEFAULT; |
|
240 break; |
|
241 case AU: |
|
242 if (src[i] == 'x') { |
|
243 state = THIRD; |
|
244 i++; |
|
245 } |
|
246 else state = DEFAULT; |
|
247 break; |
|
248 case THIRD: |
|
249 state = DEFAULT; |
|
250 switch (src[i]) { |
|
251 case '.': |
|
252 case '/': |
|
253 case '\0': |
|
254 escape3(dest, &destlen, destsize, src[i - 1]); |
|
255 break; |
|
256 default: |
|
257 i--; |
|
258 break; |
|
259 } |
|
260 break; |
|
261 case C: |
|
262 if (src[i] == 'o') { |
|
263 state = CO; |
|
264 charcopy(dest, &destlen, destsize, src[i++]); |
|
265 } |
|
266 else state = DEFAULT; |
|
267 break; |
|
268 case CO: |
|
269 if (src[i] == 'm') { |
|
270 state = COMLPT; |
|
271 i++; |
|
272 } |
|
273 else if (src[i] == 'n') { |
|
274 state = THIRD; |
|
275 i++; |
|
276 } |
|
277 else state = DEFAULT; |
|
278 break; |
|
279 case COMLPT: |
|
280 switch (src[i]) { |
|
281 case '1': case '2': case '3': case '4': case '5': |
|
282 case '6': case '7': case '8': case '9': |
|
283 state = COMLPTn; |
|
284 i++; |
|
285 break; |
|
286 default: |
|
287 state = DEFAULT; |
|
288 charcopy(dest, &destlen, destsize, src[i - 1]); |
|
289 break; |
|
290 } |
|
291 break; |
|
292 case COMLPTn: |
|
293 state = DEFAULT; |
|
294 switch (src[i]) { |
|
295 case '.': |
|
296 case '/': |
|
297 case '\0': |
|
298 escape3(dest, &destlen, destsize, src[i - 2]); |
|
299 charcopy(dest, &destlen, destsize, src[i - 1]); |
|
300 break; |
|
301 default: |
|
302 memcopy(dest, &destlen, destsize, |
|
303 &src[i - 2], 2); |
|
304 break; |
|
305 } |
|
306 break; |
|
307 case L: |
|
308 if (src[i] == 'p') { |
|
309 state = LP; |
|
310 charcopy(dest, &destlen, destsize, src[i++]); |
|
311 } |
|
312 else state = DEFAULT; |
|
313 break; |
|
314 case LP: |
|
315 if (src[i] == 't') { |
|
316 state = COMLPT; |
|
317 i++; |
|
318 } |
|
319 else state = DEFAULT; |
|
320 break; |
|
321 case N: |
|
322 if (src[i] == 'u') { |
|
323 state = NU; |
|
324 charcopy(dest, &destlen, destsize, src[i++]); |
|
325 } |
|
326 else state = DEFAULT; |
|
327 break; |
|
328 case NU: |
|
329 if (src[i] == 'l') { |
|
330 state = THIRD; |
|
331 i++; |
|
332 } |
|
333 else state = DEFAULT; |
|
334 break; |
|
335 case P: |
|
336 if (src[i] == 'r') { |
|
337 state = PR; |
|
338 charcopy(dest, &destlen, destsize, src[i++]); |
|
339 } |
|
340 else state = DEFAULT; |
|
341 break; |
|
342 case PR: |
|
343 if (src[i] == 'n') { |
|
344 state = THIRD; |
|
345 i++; |
|
346 } |
|
347 else state = DEFAULT; |
|
348 break; |
|
349 case LDOT: |
|
350 switch (src[i]) { |
|
351 case 'd': |
|
352 case 'i': |
|
353 state = HGDI; |
|
354 charcopy(dest, &destlen, destsize, src[i++]); |
|
355 break; |
|
356 case 'h': |
|
357 state = H; |
|
358 charcopy(dest, &destlen, destsize, src[i++]); |
|
359 break; |
|
360 default: |
|
361 state = DEFAULT; |
|
362 break; |
|
363 } |
|
364 break; |
|
365 case DOT: |
|
366 switch (src[i]) { |
|
367 case '/': |
|
368 case '\0': |
|
369 state = START; |
|
370 memcopy(dest, &destlen, destsize, "~2e", 3); |
|
371 charcopy(dest, &destlen, destsize, src[i++]); |
|
372 break; |
|
373 case 'd': |
|
374 case 'i': |
|
375 state = HGDI; |
|
376 charcopy(dest, &destlen, destsize, '.'); |
|
377 charcopy(dest, &destlen, destsize, src[i++]); |
|
378 break; |
|
379 case 'h': |
|
380 state = H; |
|
381 memcopy(dest, &destlen, destsize, ".h", 2); |
|
382 i++; |
|
383 break; |
|
384 default: |
|
385 state = DEFAULT; |
|
386 charcopy(dest, &destlen, destsize, '.'); |
|
387 break; |
|
388 } |
|
389 break; |
|
390 case H: |
|
391 if (src[i] == 'g') { |
|
392 state = HGDI; |
|
393 charcopy(dest, &destlen, destsize, src[i++]); |
|
394 } |
|
395 else state = DEFAULT; |
|
396 break; |
|
397 case HGDI: |
|
398 if (src[i] == '/') { |
|
399 state = START; |
|
400 if (encodedir) |
|
401 memcopy(dest, &destlen, destsize, ".hg", |
|
402 3); |
|
403 charcopy(dest, &destlen, destsize, src[i++]); |
|
404 } |
|
405 else state = DEFAULT; |
|
406 break; |
|
407 case SPACE: |
|
408 switch (src[i]) { |
|
409 case '/': |
|
410 case '\0': |
|
411 state = START; |
|
412 memcopy(dest, &destlen, destsize, "~20", 3); |
|
413 charcopy(dest, &destlen, destsize, src[i++]); |
|
414 break; |
|
415 default: |
|
416 state = DEFAULT; |
|
417 charcopy(dest, &destlen, destsize, ' '); |
|
418 break; |
|
419 } |
|
420 break; |
|
421 case DEFAULT: |
|
422 while (isset(onebyte, src[i])) { |
|
423 charcopy(dest, &destlen, destsize, src[i++]); |
|
424 if (i == len) |
|
425 goto done; |
|
426 } |
|
427 switch (src[i]) { |
|
428 case '.': |
|
429 state = DOT; |
|
430 i++; |
|
431 break; |
|
432 case ' ': |
|
433 state = SPACE; |
|
434 i++; |
|
435 break; |
|
436 case '/': |
|
437 state = START; |
|
438 charcopy(dest, &destlen, destsize, '/'); |
|
439 i++; |
|
440 break; |
|
441 default: |
|
442 if (isset(onebyte, src[i])) { |
|
443 do { |
|
444 charcopy(dest, &destlen, |
|
445 destsize, src[i++]); |
|
446 } while (i < len && |
|
447 isset(onebyte, src[i])); |
|
448 } |
|
449 else if (isset(twobytes, src[i])) { |
|
450 char c = src[i++]; |
|
451 charcopy(dest, &destlen, destsize, '_'); |
|
452 charcopy(dest, &destlen, destsize, |
|
453 c == '_' ? '_' : c + 32); |
|
454 } |
|
455 else |
|
456 escape3(dest, &destlen, destsize, |
|
457 src[i++]); |
|
458 break; |
|
459 } |
|
460 break; |
|
461 } |
|
462 } |
|
463 done: |
|
464 return destlen; |
|
465 } |
|
466 |
|
467 static Py_ssize_t basicencode(char *dest, size_t destsize, |
|
468 const char *src, Py_ssize_t len) |
|
469 { |
|
470 static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe }; |
|
471 |
|
472 static const uint32_t onebyte[8] = { |
|
473 1, 0x2bff3bfa, 0x68000001, 0x2fffffff, |
|
474 }; |
|
475 |
|
476 Py_ssize_t destlen = 0; |
|
477 |
|
478 if (len < 5 || memcmp(src, "data/", 5) != 0) { |
|
479 memcopy(dest, &destlen, destsize, src, len); |
|
480 return destlen; |
|
481 } |
|
482 |
|
483 memcopy(dest, &destlen, destsize, "data/", 5); |
|
484 |
|
485 return _encode(twobytes, onebyte, dest, destlen, destsize, |
|
486 src + 5, len - 5, 1); |
|
487 } |
|
488 |
|
489 static const Py_ssize_t maxstorepathlen = 120; |
|
490 |
|
491 /* |
|
492 * We currently implement only basic encoding. |
|
493 * |
|
494 * If a name is too long to encode due to Windows path name limits, |
|
495 * this function returns None. |
|
496 */ |
|
497 PyObject *pathencode(PyObject *self, PyObject *args) |
|
498 { |
|
499 Py_ssize_t len, newlen; |
|
500 PyObject *pathobj, *newobj; |
|
501 char *path; |
|
502 |
|
503 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj)) |
|
504 return NULL; |
|
505 |
|
506 if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) { |
|
507 PyErr_SetString(PyExc_TypeError, "expected a string"); |
|
508 return NULL; |
|
509 } |
|
510 |
|
511 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1; |
|
512 |
|
513 if (newlen <= maxstorepathlen + 1) { |
|
514 if (newlen == len + 1) { |
|
515 Py_INCREF(pathobj); |
|
516 return pathobj; |
|
517 } |
|
518 |
|
519 newobj = PyString_FromStringAndSize(NULL, newlen); |
|
520 |
|
521 if (newobj) { |
|
522 PyString_GET_SIZE(newobj)--; |
|
523 basicencode(PyString_AS_STRING(newobj), newlen, path, |
|
524 len + 1); |
|
525 } |
|
526 } else { |
|
527 newobj = Py_None; |
|
528 Py_INCREF(newobj); |
|
529 } |
|
530 |
|
531 return newobj; |
|
532 } |