Showing content from http://coverage.livinglogic.de/Objects/unicodeobject.c.html below:
Python code coverage: Objects/unicodeobject.c
1 n/a /* 2 n/a 3 n/a Unicode implementation based on original code by Fredrik Lundh, 4 n/a modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 n/a 6 n/a Major speed upgrades to the method implementations at the Reykjavik 7 n/a NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 n/a 9 n/a Copyright (c) Corporation for National Research Initiatives. 10 n/a 11 n/a -------------------------------------------------------------------- 12 n/a The original string type implementation is: 13 n/a 14 n/a Copyright (c) 1999 by Secret Labs AB 15 n/a Copyright (c) 1999 by Fredrik Lundh 16 n/a 17 n/a By obtaining, using, and/or copying this software and/or its 18 n/a associated documentation, you agree that you have read, understood, 19 n/a and will comply with the following terms and conditions: 20 n/a 21 n/a Permission to use, copy, modify, and distribute this software and its 22 n/a associated documentation for any purpose and without fee is hereby 23 n/a granted, provided that the above copyright notice appears in all 24 n/a copies, and that both that copyright notice and this permission notice 25 n/a appear in supporting documentation, and that the name of Secret Labs 26 n/a AB or the author not be used in advertising or publicity pertaining to 27 n/a distribution of the software without specific, written prior 28 n/a permission. 29 n/a 30 n/a SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31 n/a THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32 n/a FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33 n/a ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34 n/a WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35 n/a ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36 n/a OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37 n/a -------------------------------------------------------------------- 38 n/a 39 n/a */ 40 n/a 41 n/a #define PY_SSIZE_T_CLEAN 42 n/a #include "Python.h" 43 n/a #include "ucnhash.h" 44 n/a #include "bytes_methods.h" 45 n/a #include "stringlib/eq.h" 46 n/a 47 n/a #ifdef MS_WINDOWS 48 n/a #include <windows.h> 49 n/a #endif 50 n/a 51 n/a /*[clinic input] 52 n/a class str "PyObject *" "&PyUnicode_Type" 53 n/a [clinic start generated code]*/ 54 n/a /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/ 55 n/a 56 n/a /*[python input] 57 n/a class Py_UCS4_converter(CConverter): 58 n/a type = 'Py_UCS4' 59 n/a converter = 'convert_uc' 60 n/a 61 n/a def converter_init(self): 62 n/a if self.default is not unspecified: 63 n/a self.c_default = ascii(self.default) 64 n/a if len(self.c_default) > 4 or self.c_default[0] != "'": 65 n/a self.c_default = hex(ord(self.default)) 66 n/a 67 n/a [python start generated code]*/ 68 n/a /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/ 69 n/a 70 n/a /* --- Globals ------------------------------------------------------------ 71 n/a 72 n/a NOTE: In the interpreter's initialization phase, some globals are currently 73 n/a initialized dynamically as needed. In the process Unicode objects may 74 n/a be created before the Unicode type is ready. 75 n/a 76 n/a */ 77 n/a 78 n/a 79 n/a #ifdef __cplusplus 80 n/a extern "C" { 81 n/a #endif 82 n/a 83 n/a /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 84 n/a #define MAX_UNICODE 0x10ffff 85 n/a 86 n/a #ifdef Py_DEBUG 87 n/a # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 88 n/a #else 89 n/a # define _PyUnicode_CHECK(op) PyUnicode_Check(op) 90 n/a #endif 91 n/a 92 n/a #define _PyUnicode_UTF8(op) \ 93 n/a (((PyCompactUnicodeObject*)(op))->utf8) 94 n/a #define PyUnicode_UTF8(op) \ 95 n/a (assert(_PyUnicode_CHECK(op)), \ 96 n/a assert(PyUnicode_IS_READY(op)), \ 97 n/a PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 98 n/a ((char*)((PyASCIIObject*)(op) + 1)) : \ 99 n/a _PyUnicode_UTF8(op)) 100 n/a #define _PyUnicode_UTF8_LENGTH(op) \ 101 n/a (((PyCompactUnicodeObject*)(op))->utf8_length) 102 n/a #define PyUnicode_UTF8_LENGTH(op) \ 103 n/a (assert(_PyUnicode_CHECK(op)), \ 104 n/a assert(PyUnicode_IS_READY(op)), \ 105 n/a PyUnicode_IS_COMPACT_ASCII(op) ? \ 106 n/a ((PyASCIIObject*)(op))->length : \ 107 n/a _PyUnicode_UTF8_LENGTH(op)) 108 n/a #define _PyUnicode_WSTR(op) \ 109 n/a (((PyASCIIObject*)(op))->wstr) 110 n/a #define _PyUnicode_WSTR_LENGTH(op) \ 111 n/a (((PyCompactUnicodeObject*)(op))->wstr_length) 112 n/a #define _PyUnicode_LENGTH(op) \ 113 n/a (((PyASCIIObject *)(op))->length) 114 n/a #define _PyUnicode_STATE(op) \ 115 n/a (((PyASCIIObject *)(op))->state) 116 n/a #define _PyUnicode_HASH(op) \ 117 n/a (((PyASCIIObject *)(op))->hash) 118 n/a #define _PyUnicode_KIND(op) \ 119 n/a (assert(_PyUnicode_CHECK(op)), \ 120 n/a ((PyASCIIObject *)(op))->state.kind) 121 n/a #define _PyUnicode_GET_LENGTH(op) \ 122 n/a (assert(_PyUnicode_CHECK(op)), \ 123 n/a ((PyASCIIObject *)(op))->length) 124 n/a #define _PyUnicode_DATA_ANY(op) \ 125 n/a (((PyUnicodeObject*)(op))->data.any) 126 n/a 127 n/a #undef PyUnicode_READY 128 n/a #define PyUnicode_READY(op) \ 129 n/a (assert(_PyUnicode_CHECK(op)), \ 130 n/a (PyUnicode_IS_READY(op) ? 
\ 131 n/a 0 : \ 132 n/a _PyUnicode_Ready(op))) 133 n/a 134 n/a #define _PyUnicode_SHARE_UTF8(op) \ 135 n/a (assert(_PyUnicode_CHECK(op)), \ 136 n/a assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 137 n/a (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 138 n/a #define _PyUnicode_SHARE_WSTR(op) \ 139 n/a (assert(_PyUnicode_CHECK(op)), \ 140 n/a (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 141 n/a 142 n/a /* true if the Unicode object has an allocated UTF-8 memory block 143 n/a (not shared with other data) */ 144 n/a #define _PyUnicode_HAS_UTF8_MEMORY(op) \ 145 n/a ((!PyUnicode_IS_COMPACT_ASCII(op) \ 146 n/a && _PyUnicode_UTF8(op) \ 147 n/a && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 148 n/a 149 n/a /* true if the Unicode object has an allocated wstr memory block 150 n/a (not shared with other data) */ 151 n/a #define _PyUnicode_HAS_WSTR_MEMORY(op) \ 152 n/a ((_PyUnicode_WSTR(op) && \ 153 n/a (!PyUnicode_IS_READY(op) || \ 154 n/a _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 155 n/a 156 n/a /* Generic helper macro to convert characters of different types. 157 n/a from_type and to_type have to be valid type names, begin and end 158 n/a are pointers to the source characters which should be of type 159 n/a "from_type *". to is a pointer of type "to_type *" and points to the 160 n/a buffer where the result characters are written to. 
*/ 161 n/a #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 162 n/a do { \ 163 n/a to_type *_to = (to_type *)(to); \ 164 n/a const from_type *_iter = (from_type *)(begin); \ 165 n/a const from_type *_end = (from_type *)(end); \ 166 n/a Py_ssize_t n = (_end) - (_iter); \ 167 n/a const from_type *_unrolled_end = \ 168 n/a _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 169 n/a while (_iter < (_unrolled_end)) { \ 170 n/a _to[0] = (to_type) _iter[0]; \ 171 n/a _to[1] = (to_type) _iter[1]; \ 172 n/a _to[2] = (to_type) _iter[2]; \ 173 n/a _to[3] = (to_type) _iter[3]; \ 174 n/a _iter += 4; _to += 4; \ 175 n/a } \ 176 n/a while (_iter < (_end)) \ 177 n/a *_to++ = (to_type) *_iter++; \ 178 n/a } while (0) 179 n/a 180 n/a #ifdef MS_WINDOWS 181 n/a /* On Windows, overallocate by 50% is the best factor */ 182 n/a # define OVERALLOCATE_FACTOR 2 183 n/a #else 184 n/a /* On Linux, overallocate by 25% is the best factor */ 185 n/a # define OVERALLOCATE_FACTOR 4 186 n/a #endif 187 n/a 188 n/a /* This dictionary holds all interned unicode strings. Note that references 189 n/a to strings in this dictionary are *not* counted in the string's ob_refcnt. 190 n/a When the interned string reaches a refcnt of 0 the string deallocation 191 n/a function will delete the reference from this dictionary. 192 n/a 193 n/a Another way to look at this is that to say that the actual reference 194 n/a count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 195 n/a */ 196 n/a static PyObject *interned = NULL; 197 n/a 198 n/a /* The empty Unicode object is shared to improve performance. 
*/ 199 n/a static PyObject *unicode_empty = NULL; 200 n/a 201 n/a #define _Py_INCREF_UNICODE_EMPTY() \ 202 n/a do { \ 203 n/a if (unicode_empty != NULL) \ 204 n/a Py_INCREF(unicode_empty); \ 205 n/a else { \ 206 n/a unicode_empty = PyUnicode_New(0, 0); \ 207 n/a if (unicode_empty != NULL) { \ 208 n/a Py_INCREF(unicode_empty); \ 209 n/a assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 210 n/a } \ 211 n/a } \ 212 n/a } while (0) 213 n/a 214 n/a #define _Py_RETURN_UNICODE_EMPTY() \ 215 n/a do { \ 216 n/a _Py_INCREF_UNICODE_EMPTY(); \ 217 n/a return unicode_empty; \ 218 n/a } while (0) 219 n/a 220 n/a /* Forward declaration */ 221 n/a static inline int 222 n/a _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 223 n/a 224 n/a /* List of static strings. */ 225 n/a static _Py_Identifier *static_strings = NULL; 226 n/a 227 n/a /* Single character Unicode strings in the Latin-1 range are being 228 n/a shared as well. */ 229 n/a static PyObject *unicode_latin1[256] = {NULL}; 230 n/a 231 n/a /* Fast detection of the most frequent whitespace characters */ 232 n/a const unsigned char _Py_ascii_whitespace[] = { 233 n/a 0, 0, 0, 0, 0, 0, 0, 0, 234 n/a /* case 0x0009: * CHARACTER TABULATION */ 235 n/a /* case 0x000A: * LINE FEED */ 236 n/a /* case 0x000B: * LINE TABULATION */ 237 n/a /* case 0x000C: * FORM FEED */ 238 n/a /* case 0x000D: * CARRIAGE RETURN */ 239 n/a 0, 1, 1, 1, 1, 1, 0, 0, 240 n/a 0, 0, 0, 0, 0, 0, 0, 0, 241 n/a /* case 0x001C: * FILE SEPARATOR */ 242 n/a /* case 0x001D: * GROUP SEPARATOR */ 243 n/a /* case 0x001E: * RECORD SEPARATOR */ 244 n/a /* case 0x001F: * UNIT SEPARATOR */ 245 n/a 0, 0, 0, 0, 1, 1, 1, 1, 246 n/a /* case 0x0020: * SPACE */ 247 n/a 1, 0, 0, 0, 0, 0, 0, 0, 248 n/a 0, 0, 0, 0, 0, 0, 0, 0, 249 n/a 0, 0, 0, 0, 0, 0, 0, 0, 250 n/a 0, 0, 0, 0, 0, 0, 0, 0, 251 n/a 252 n/a 0, 0, 0, 0, 0, 0, 0, 0, 253 n/a 0, 0, 0, 0, 0, 0, 0, 0, 254 n/a 0, 0, 0, 0, 0, 0, 0, 0, 255 n/a 0, 0, 0, 0, 0, 0, 0, 0, 256 n/a 0, 0, 0, 0, 0, 0, 0, 
0, 257 n/a 0, 0, 0, 0, 0, 0, 0, 0, 258 n/a 0, 0, 0, 0, 0, 0, 0, 0, 259 n/a 0, 0, 0, 0, 0, 0, 0, 0 260 n/a }; 261 n/a 262 n/a /* forward */ 263 n/a static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 264 n/a static PyObject* get_latin1_char(unsigned char ch); 265 n/a static int unicode_modifiable(PyObject *unicode); 266 n/a 267 n/a 268 n/a static PyObject * 269 n/a _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 270 n/a static PyObject * 271 n/a _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 272 n/a static PyObject * 273 n/a _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 274 n/a 275 n/a static PyObject * 276 n/a unicode_encode_call_errorhandler(const char *errors, 277 n/a PyObject **errorHandler,const char *encoding, const char *reason, 278 n/a PyObject *unicode, PyObject **exceptionObject, 279 n/a Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 280 n/a 281 n/a static void 282 n/a raise_encode_exception(PyObject **exceptionObject, 283 n/a const char *encoding, 284 n/a PyObject *unicode, 285 n/a Py_ssize_t startpos, Py_ssize_t endpos, 286 n/a const char *reason); 287 n/a 288 n/a /* Same for linebreaks */ 289 n/a static const unsigned char ascii_linebreak[] = { 290 n/a 0, 0, 0, 0, 0, 0, 0, 0, 291 n/a /* 0x000A, * LINE FEED */ 292 n/a /* 0x000B, * LINE TABULATION */ 293 n/a /* 0x000C, * FORM FEED */ 294 n/a /* 0x000D, * CARRIAGE RETURN */ 295 n/a 0, 0, 1, 1, 1, 1, 0, 0, 296 n/a 0, 0, 0, 0, 0, 0, 0, 0, 297 n/a /* 0x001C, * FILE SEPARATOR */ 298 n/a /* 0x001D, * GROUP SEPARATOR */ 299 n/a /* 0x001E, * RECORD SEPARATOR */ 300 n/a 0, 0, 0, 0, 1, 1, 1, 0, 301 n/a 0, 0, 0, 0, 0, 0, 0, 0, 302 n/a 0, 0, 0, 0, 0, 0, 0, 0, 303 n/a 0, 0, 0, 0, 0, 0, 0, 0, 304 n/a 0, 0, 0, 0, 0, 0, 0, 0, 305 n/a 306 n/a 0, 0, 0, 0, 0, 0, 0, 0, 307 n/a 0, 0, 0, 0, 0, 0, 0, 0, 308 n/a 0, 0, 0, 0, 0, 0, 0, 0, 309 n/a 0, 0, 0, 0, 0, 0, 0, 0, 310 n/a 0, 0, 0, 0, 0, 0, 0, 0, 311 n/a 0, 0, 0, 0, 0, 0, 0, 0, 312 n/a 0, 0, 0, 0, 0, 0, 0, 0, 313 n/a 0, 0, 0, 0, 
0, 0, 0, 0 314 n/a }; 315 n/a 316 n/a static int convert_uc(PyObject *obj, void *addr); 317 n/a 318 n/a #include "clinic/unicodeobject.c.h" 319 n/a 320 n/a typedef enum { 321 n/a _Py_ERROR_UNKNOWN=0, 322 n/a _Py_ERROR_STRICT, 323 n/a _Py_ERROR_SURROGATEESCAPE, 324 n/a _Py_ERROR_REPLACE, 325 n/a _Py_ERROR_IGNORE, 326 n/a _Py_ERROR_BACKSLASHREPLACE, 327 n/a _Py_ERROR_SURROGATEPASS, 328 n/a _Py_ERROR_XMLCHARREFREPLACE, 329 n/a _Py_ERROR_OTHER 330 n/a } _Py_error_handler; 331 n/a 332 n/a static _Py_error_handler 333 n/a get_error_handler(const char *errors) 334 n/a { 335 n/a if (errors == NULL || strcmp(errors, "strict") == 0) { 336 n/a return _Py_ERROR_STRICT; 337 n/a } 338 n/a if (strcmp(errors, "surrogateescape") == 0) { 339 n/a return _Py_ERROR_SURROGATEESCAPE; 340 n/a } 341 n/a if (strcmp(errors, "replace") == 0) { 342 n/a return _Py_ERROR_REPLACE; 343 n/a } 344 n/a if (strcmp(errors, "ignore") == 0) { 345 n/a return _Py_ERROR_IGNORE; 346 n/a } 347 n/a if (strcmp(errors, "backslashreplace") == 0) { 348 n/a return _Py_ERROR_BACKSLASHREPLACE; 349 n/a } 350 n/a if (strcmp(errors, "surrogatepass") == 0) { 351 n/a return _Py_ERROR_SURROGATEPASS; 352 n/a } 353 n/a if (strcmp(errors, "xmlcharrefreplace") == 0) { 354 n/a return _Py_ERROR_XMLCHARREFREPLACE; 355 n/a } 356 n/a return _Py_ERROR_OTHER; 357 n/a } 358 n/a 359 n/a /* The max unicode value is always 0x10FFFF while using the PEP-393 API. 360 n/a This function is kept for backward compatibility with the old API. */ 361 n/a Py_UNICODE 362 n/a PyUnicode_GetMax(void) 363 n/a { 364 n/a #ifdef Py_UNICODE_WIDE 365 n/a return 0x10FFFF; 366 n/a #else 367 n/a /* This is actually an illegal character, so it should 368 n/a not be passed to unichr. 
*/ 369 n/a return 0xFFFF; 370 n/a #endif 371 n/a } 372 n/a 373 n/a #ifdef Py_DEBUG 374 n/a int 375 n/a _PyUnicode_CheckConsistency(PyObject *op, int check_content) 376 n/a { 377 n/a PyASCIIObject *ascii; 378 n/a unsigned int kind; 379 n/a 380 n/a assert(PyUnicode_Check(op)); 381 n/a 382 n/a ascii = (PyASCIIObject *)op; 383 n/a kind = ascii->state.kind; 384 n/a 385 n/a if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 386 n/a assert(kind == PyUnicode_1BYTE_KIND); 387 n/a assert(ascii->state.ready == 1); 388 n/a } 389 n/a else { 390 n/a PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 391 n/a void *data; 392 n/a 393 n/a if (ascii->state.compact == 1) { 394 n/a data = compact + 1; 395 n/a assert(kind == PyUnicode_1BYTE_KIND 396 n/a || kind == PyUnicode_2BYTE_KIND 397 n/a || kind == PyUnicode_4BYTE_KIND); 398 n/a assert(ascii->state.ascii == 0); 399 n/a assert(ascii->state.ready == 1); 400 n/a assert (compact->utf8 != data); 401 n/a } 402 n/a else { 403 n/a PyUnicodeObject *unicode = (PyUnicodeObject *)op; 404 n/a 405 n/a data = unicode->data.any; 406 n/a if (kind == PyUnicode_WCHAR_KIND) { 407 n/a assert(ascii->length == 0); 408 n/a assert(ascii->hash == -1); 409 n/a assert(ascii->state.compact == 0); 410 n/a assert(ascii->state.ascii == 0); 411 n/a assert(ascii->state.ready == 0); 412 n/a assert(ascii->state.interned == SSTATE_NOT_INTERNED); 413 n/a assert(ascii->wstr != NULL); 414 n/a assert(data == NULL); 415 n/a assert(compact->utf8 == NULL); 416 n/a } 417 n/a else { 418 n/a assert(kind == PyUnicode_1BYTE_KIND 419 n/a || kind == PyUnicode_2BYTE_KIND 420 n/a || kind == PyUnicode_4BYTE_KIND); 421 n/a assert(ascii->state.compact == 0); 422 n/a assert(ascii->state.ready == 1); 423 n/a assert(data != NULL); 424 n/a if (ascii->state.ascii) { 425 n/a assert (compact->utf8 == data); 426 n/a assert (compact->utf8_length == ascii->length); 427 n/a } 428 n/a else 429 n/a assert (compact->utf8 != data); 430 n/a } 431 n/a } 432 n/a if (kind != 
PyUnicode_WCHAR_KIND) { 433 n/a if ( 434 n/a #if SIZEOF_WCHAR_T == 2 435 n/a kind == PyUnicode_2BYTE_KIND 436 n/a #else 437 n/a kind == PyUnicode_4BYTE_KIND 438 n/a #endif 439 n/a ) 440 n/a { 441 n/a assert(ascii->wstr == data); 442 n/a assert(compact->wstr_length == ascii->length); 443 n/a } else 444 n/a assert(ascii->wstr != data); 445 n/a } 446 n/a 447 n/a if (compact->utf8 == NULL) 448 n/a assert(compact->utf8_length == 0); 449 n/a if (ascii->wstr == NULL) 450 n/a assert(compact->wstr_length == 0); 451 n/a } 452 n/a /* check that the best kind is used */ 453 n/a if (check_content && kind != PyUnicode_WCHAR_KIND) 454 n/a { 455 n/a Py_ssize_t i; 456 n/a Py_UCS4 maxchar = 0; 457 n/a void *data; 458 n/a Py_UCS4 ch; 459 n/a 460 n/a data = PyUnicode_DATA(ascii); 461 n/a for (i=0; i < ascii->length; i++) 462 n/a { 463 n/a ch = PyUnicode_READ(kind, data, i); 464 n/a if (ch > maxchar) 465 n/a maxchar = ch; 466 n/a } 467 n/a if (kind == PyUnicode_1BYTE_KIND) { 468 n/a if (ascii->state.ascii == 0) { 469 n/a assert(maxchar >= 128); 470 n/a assert(maxchar <= 255); 471 n/a } 472 n/a else 473 n/a assert(maxchar < 128); 474 n/a } 475 n/a else if (kind == PyUnicode_2BYTE_KIND) { 476 n/a assert(maxchar >= 0x100); 477 n/a assert(maxchar <= 0xFFFF); 478 n/a } 479 n/a else { 480 n/a assert(maxchar >= 0x10000); 481 n/a assert(maxchar <= MAX_UNICODE); 482 n/a } 483 n/a assert(PyUnicode_READ(kind, data, ascii->length) == 0); 484 n/a } 485 n/a return 1; 486 n/a } 487 n/a #endif 488 n/a 489 n/a static PyObject* 490 n/a unicode_result_wchar(PyObject *unicode) 491 n/a { 492 n/a #ifndef Py_DEBUG 493 n/a Py_ssize_t len; 494 n/a 495 n/a len = _PyUnicode_WSTR_LENGTH(unicode); 496 n/a if (len == 0) { 497 n/a Py_DECREF(unicode); 498 n/a _Py_RETURN_UNICODE_EMPTY(); 499 n/a } 500 n/a 501 n/a if (len == 1) { 502 n/a wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 503 n/a if ((Py_UCS4)ch < 256) { 504 n/a PyObject *latin1_char = get_latin1_char((unsigned char)ch); 505 n/a Py_DECREF(unicode); 506 n/a 
return latin1_char; 507 n/a } 508 n/a } 509 n/a 510 n/a if (_PyUnicode_Ready(unicode) < 0) { 511 n/a Py_DECREF(unicode); 512 n/a return NULL; 513 n/a } 514 n/a #else 515 n/a assert(Py_REFCNT(unicode) == 1); 516 n/a 517 n/a /* don't make the result ready in debug mode to ensure that the caller 518 n/a makes the string ready before using it */ 519 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 520 n/a #endif 521 n/a return unicode; 522 n/a } 523 n/a 524 n/a static PyObject* 525 n/a unicode_result_ready(PyObject *unicode) 526 n/a { 527 n/a Py_ssize_t length; 528 n/a 529 n/a length = PyUnicode_GET_LENGTH(unicode); 530 n/a if (length == 0) { 531 n/a if (unicode != unicode_empty) { 532 n/a Py_DECREF(unicode); 533 n/a _Py_RETURN_UNICODE_EMPTY(); 534 n/a } 535 n/a return unicode_empty; 536 n/a } 537 n/a 538 n/a if (length == 1) { 539 n/a void *data = PyUnicode_DATA(unicode); 540 n/a int kind = PyUnicode_KIND(unicode); 541 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 542 n/a if (ch < 256) { 543 n/a PyObject *latin1_char = unicode_latin1[ch]; 544 n/a if (latin1_char != NULL) { 545 n/a if (unicode != latin1_char) { 546 n/a Py_INCREF(latin1_char); 547 n/a Py_DECREF(unicode); 548 n/a } 549 n/a return latin1_char; 550 n/a } 551 n/a else { 552 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 553 n/a Py_INCREF(unicode); 554 n/a unicode_latin1[ch] = unicode; 555 n/a return unicode; 556 n/a } 557 n/a } 558 n/a } 559 n/a 560 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 561 n/a return unicode; 562 n/a } 563 n/a 564 n/a static PyObject* 565 n/a unicode_result(PyObject *unicode) 566 n/a { 567 n/a assert(_PyUnicode_CHECK(unicode)); 568 n/a if (PyUnicode_IS_READY(unicode)) 569 n/a return unicode_result_ready(unicode); 570 n/a else 571 n/a return unicode_result_wchar(unicode); 572 n/a } 573 n/a 574 n/a static PyObject* 575 n/a unicode_result_unchanged(PyObject *unicode) 576 n/a { 577 n/a if (PyUnicode_CheckExact(unicode)) { 578 n/a if (PyUnicode_READY(unicode) == -1) 
579 n/a return NULL; 580 n/a Py_INCREF(unicode); 581 n/a return unicode; 582 n/a } 583 n/a else 584 n/a /* Subtype -- return genuine unicode string with the same value. */ 585 n/a return _PyUnicode_Copy(unicode); 586 n/a } 587 n/a 588 n/a /* Implementation of the "backslashreplace" error handler for 8-bit encodings: 589 n/a ASCII, Latin1, UTF-8, etc. */ 590 n/a static char* 591 n/a backslashreplace(_PyBytesWriter *writer, char *str, 592 n/a PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 593 n/a { 594 n/a Py_ssize_t size, i; 595 n/a Py_UCS4 ch; 596 n/a enum PyUnicode_Kind kind; 597 n/a void *data; 598 n/a 599 n/a assert(PyUnicode_IS_READY(unicode)); 600 n/a kind = PyUnicode_KIND(unicode); 601 n/a data = PyUnicode_DATA(unicode); 602 n/a 603 n/a size = 0; 604 n/a /* determine replacement size */ 605 n/a for (i = collstart; i < collend; ++i) { 606 n/a Py_ssize_t incr; 607 n/a 608 n/a ch = PyUnicode_READ(kind, data, i); 609 n/a if (ch < 0x100) 610 n/a incr = 2+2; 611 n/a else if (ch < 0x10000) 612 n/a incr = 2+4; 613 n/a else { 614 n/a assert(ch <= MAX_UNICODE); 615 n/a incr = 2+8; 616 n/a } 617 n/a if (size > PY_SSIZE_T_MAX - incr) { 618 n/a PyErr_SetString(PyExc_OverflowError, 619 n/a "encoded result is too long for a Python string"); 620 n/a return NULL; 621 n/a } 622 n/a size += incr; 623 n/a } 624 n/a 625 n/a str = _PyBytesWriter_Prepare(writer, str, size); 626 n/a if (str == NULL) 627 n/a return NULL; 628 n/a 629 n/a /* generate replacement */ 630 n/a for (i = collstart; i < collend; ++i) { 631 n/a ch = PyUnicode_READ(kind, data, i); 632 n/a *str++ = '\\'; 633 n/a if (ch >= 0x00010000) { 634 n/a *str++ = 'U'; 635 n/a *str++ = Py_hexdigits[(ch>>28)&0xf]; 636 n/a *str++ = Py_hexdigits[(ch>>24)&0xf]; 637 n/a *str++ = Py_hexdigits[(ch>>20)&0xf]; 638 n/a *str++ = Py_hexdigits[(ch>>16)&0xf]; 639 n/a *str++ = Py_hexdigits[(ch>>12)&0xf]; 640 n/a *str++ = Py_hexdigits[(ch>>8)&0xf]; 641 n/a } 642 n/a else if (ch >= 0x100) { 643 n/a *str++ = 'u'; 644 n/a *str++ 
= Py_hexdigits[(ch>>12)&0xf]; 645 n/a *str++ = Py_hexdigits[(ch>>8)&0xf]; 646 n/a } 647 n/a else 648 n/a *str++ = 'x'; 649 n/a *str++ = Py_hexdigits[(ch>>4)&0xf]; 650 n/a *str++ = Py_hexdigits[ch&0xf]; 651 n/a } 652 n/a return str; 653 n/a } 654 n/a 655 n/a /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: 656 n/a ASCII, Latin1, UTF-8, etc. */ 657 n/a static char* 658 n/a xmlcharrefreplace(_PyBytesWriter *writer, char *str, 659 n/a PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 660 n/a { 661 n/a Py_ssize_t size, i; 662 n/a Py_UCS4 ch; 663 n/a enum PyUnicode_Kind kind; 664 n/a void *data; 665 n/a 666 n/a assert(PyUnicode_IS_READY(unicode)); 667 n/a kind = PyUnicode_KIND(unicode); 668 n/a data = PyUnicode_DATA(unicode); 669 n/a 670 n/a size = 0; 671 n/a /* determine replacement size */ 672 n/a for (i = collstart; i < collend; ++i) { 673 n/a Py_ssize_t incr; 674 n/a 675 n/a ch = PyUnicode_READ(kind, data, i); 676 n/a if (ch < 10) 677 n/a incr = 2+1+1; 678 n/a else if (ch < 100) 679 n/a incr = 2+2+1; 680 n/a else if (ch < 1000) 681 n/a incr = 2+3+1; 682 n/a else if (ch < 10000) 683 n/a incr = 2+4+1; 684 n/a else if (ch < 100000) 685 n/a incr = 2+5+1; 686 n/a else if (ch < 1000000) 687 n/a incr = 2+6+1; 688 n/a else { 689 n/a assert(ch <= MAX_UNICODE); 690 n/a incr = 2+7+1; 691 n/a } 692 n/a if (size > PY_SSIZE_T_MAX - incr) { 693 n/a PyErr_SetString(PyExc_OverflowError, 694 n/a "encoded result is too long for a Python string"); 695 n/a return NULL; 696 n/a } 697 n/a size += incr; 698 n/a } 699 n/a 700 n/a str = _PyBytesWriter_Prepare(writer, str, size); 701 n/a if (str == NULL) 702 n/a return NULL; 703 n/a 704 n/a /* generate replacement */ 705 n/a for (i = collstart; i < collend; ++i) { 706 n/a str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 707 n/a } 708 n/a return str; 709 n/a } 710 n/a 711 n/a /* --- Bloom Filters ----------------------------------------------------- */ 712 n/a 713 n/a /* stuff to implement 
simple "bloom filters" for Unicode characters. 714 n/a to keep things simple, we use a single bitmask, using the least 5 715 n/a bits from each unicode characters as the bit index. */ 716 n/a 717 n/a /* the linebreak mask is set up by Unicode_Init below */ 718 n/a 719 n/a #if LONG_BIT >= 128 720 n/a #define BLOOM_WIDTH 128 721 n/a #elif LONG_BIT >= 64 722 n/a #define BLOOM_WIDTH 64 723 n/a #elif LONG_BIT >= 32 724 n/a #define BLOOM_WIDTH 32 725 n/a #else 726 n/a #error "LONG_BIT is smaller than 32" 727 n/a #endif 728 n/a 729 n/a #define BLOOM_MASK unsigned long 730 n/a 731 n/a static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 732 n/a 733 n/a #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 734 n/a 735 n/a #define BLOOM_LINEBREAK(ch) \ 736 n/a ((ch) < 128U ? ascii_linebreak[(ch)] : \ 737 n/a (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 738 n/a 739 n/a static inline BLOOM_MASK 740 n/a make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 741 n/a { 742 n/a #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 743 n/a do { \ 744 n/a TYPE *data = (TYPE *)PTR; \ 745 n/a TYPE *end = data + LEN; \ 746 n/a Py_UCS4 ch; \ 747 n/a for (; data != end; data++) { \ 748 n/a ch = *data; \ 749 n/a MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 750 n/a } \ 751 n/a break; \ 752 n/a } while (0) 753 n/a 754 n/a /* calculate simple bloom-style bitmask for a given unicode string */ 755 n/a 756 n/a BLOOM_MASK mask; 757 n/a 758 n/a mask = 0; 759 n/a switch (kind) { 760 n/a case PyUnicode_1BYTE_KIND: 761 n/a BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 762 n/a break; 763 n/a case PyUnicode_2BYTE_KIND: 764 n/a BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 765 n/a break; 766 n/a case PyUnicode_4BYTE_KIND: 767 n/a BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 768 n/a break; 769 n/a default: 770 n/a assert(0); 771 n/a } 772 n/a return mask; 773 n/a 774 n/a #undef BLOOM_UPDATE 775 n/a } 776 n/a 777 n/a static int 778 n/a ensure_unicode(PyObject *obj) 779 n/a { 780 n/a if 
(!PyUnicode_Check(obj)) { 781 n/a PyErr_Format(PyExc_TypeError, 782 n/a "must be str, not %.100s", 783 n/a Py_TYPE(obj)->tp_name); 784 n/a return -1; 785 n/a } 786 n/a return PyUnicode_READY(obj); 787 n/a } 788 n/a 789 n/a /* Compilation of templated routines */ 790 n/a 791 n/a #include "stringlib/asciilib.h" 792 n/a #include "stringlib/fastsearch.h" 793 n/a #include "stringlib/partition.h" 794 n/a #include "stringlib/split.h" 795 n/a #include "stringlib/count.h" 796 n/a #include "stringlib/find.h" 797 n/a #include "stringlib/find_max_char.h" 798 n/a #include "stringlib/localeutil.h" 799 n/a #include "stringlib/undef.h" 800 n/a 801 n/a #include "stringlib/ucs1lib.h" 802 n/a #include "stringlib/fastsearch.h" 803 n/a #include "stringlib/partition.h" 804 n/a #include "stringlib/split.h" 805 n/a #include "stringlib/count.h" 806 n/a #include "stringlib/find.h" 807 n/a #include "stringlib/replace.h" 808 n/a #include "stringlib/find_max_char.h" 809 n/a #include "stringlib/localeutil.h" 810 n/a #include "stringlib/undef.h" 811 n/a 812 n/a #include "stringlib/ucs2lib.h" 813 n/a #include "stringlib/fastsearch.h" 814 n/a #include "stringlib/partition.h" 815 n/a #include "stringlib/split.h" 816 n/a #include "stringlib/count.h" 817 n/a #include "stringlib/find.h" 818 n/a #include "stringlib/replace.h" 819 n/a #include "stringlib/find_max_char.h" 820 n/a #include "stringlib/localeutil.h" 821 n/a #include "stringlib/undef.h" 822 n/a 823 n/a #include "stringlib/ucs4lib.h" 824 n/a #include "stringlib/fastsearch.h" 825 n/a #include "stringlib/partition.h" 826 n/a #include "stringlib/split.h" 827 n/a #include "stringlib/count.h" 828 n/a #include "stringlib/find.h" 829 n/a #include "stringlib/replace.h" 830 n/a #include "stringlib/find_max_char.h" 831 n/a #include "stringlib/localeutil.h" 832 n/a #include "stringlib/undef.h" 833 n/a 834 n/a #include "stringlib/unicodedefs.h" 835 n/a #include "stringlib/fastsearch.h" 836 n/a #include "stringlib/count.h" 837 n/a #include 
"stringlib/find.h" 838 n/a #include "stringlib/undef.h" 839 n/a 840 n/a /* --- Unicode Object ----------------------------------------------------- */ 841 n/a 842 n/a static PyObject * 843 n/a fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 844 n/a 845 n/a static inline Py_ssize_t 846 n/a findchar(const void *s, int kind, 847 n/a Py_ssize_t size, Py_UCS4 ch, 848 n/a int direction) 849 n/a { 850 n/a switch (kind) { 851 n/a case PyUnicode_1BYTE_KIND: 852 n/a if ((Py_UCS1) ch != ch) 853 n/a return -1; 854 n/a if (direction > 0) 855 n/a return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch); 856 n/a else 857 n/a return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch); 858 n/a case PyUnicode_2BYTE_KIND: 859 n/a if ((Py_UCS2) ch != ch) 860 n/a return -1; 861 n/a if (direction > 0) 862 n/a return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch); 863 n/a else 864 n/a return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch); 865 n/a case PyUnicode_4BYTE_KIND: 866 n/a if (direction > 0) 867 n/a return ucs4lib_find_char((Py_UCS4 *) s, size, ch); 868 n/a else 869 n/a return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch); 870 n/a default: 871 n/a assert(0); 872 n/a return -1; 873 n/a } 874 n/a } 875 n/a 876 n/a #ifdef Py_DEBUG 877 n/a /* Fill the data of a Unicode string with invalid characters to detect bugs 878 n/a earlier. 879 n/a 880 n/a _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 881 n/a ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 882 n/a invalid character in Unicode 6.0. 
*/ 883 n/a static void 884 n/a unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 885 n/a { 886 n/a int kind = PyUnicode_KIND(unicode); 887 n/a Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 888 n/a Py_ssize_t length = _PyUnicode_LENGTH(unicode); 889 n/a if (length <= old_length) 890 n/a return; 891 n/a memset(data + old_length * kind, 0xff, (length - old_length) * kind); 892 n/a } 893 n/a #endif 894 n/a 895 n/a static PyObject* 896 n/a resize_compact(PyObject *unicode, Py_ssize_t length) 897 n/a { 898 n/a Py_ssize_t char_size; 899 n/a Py_ssize_t struct_size; 900 n/a Py_ssize_t new_size; 901 n/a int share_wstr; 902 n/a PyObject *new_unicode; 903 n/a #ifdef Py_DEBUG 904 n/a Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 905 n/a #endif 906 n/a 907 n/a assert(unicode_modifiable(unicode)); 908 n/a assert(PyUnicode_IS_READY(unicode)); 909 n/a assert(PyUnicode_IS_COMPACT(unicode)); 910 n/a 911 n/a char_size = PyUnicode_KIND(unicode); 912 n/a if (PyUnicode_IS_ASCII(unicode)) 913 n/a struct_size = sizeof(PyASCIIObject); 914 n/a else 915 n/a struct_size = sizeof(PyCompactUnicodeObject); 916 n/a share_wstr = _PyUnicode_SHARE_WSTR(unicode); 917 n/a 918 n/a if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 919 n/a PyErr_NoMemory(); 920 n/a return NULL; 921 n/a } 922 n/a new_size = (struct_size + (length + 1) * char_size); 923 n/a 924 n/a if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { 925 n/a PyObject_DEL(_PyUnicode_UTF8(unicode)); 926 n/a _PyUnicode_UTF8(unicode) = NULL; 927 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 928 n/a } 929 n/a _Py_DEC_REFTOTAL; 930 n/a _Py_ForgetReference(unicode); 931 n/a 932 n/a new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 933 n/a if (new_unicode == NULL) { 934 n/a _Py_NewReference(unicode); 935 n/a PyErr_NoMemory(); 936 n/a return NULL; 937 n/a } 938 n/a unicode = new_unicode; 939 n/a _Py_NewReference(unicode); 940 n/a 941 n/a _PyUnicode_LENGTH(unicode) = length; 942 n/a if (share_wstr) { 943 n/a 
_PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 944 n/a if (!PyUnicode_IS_ASCII(unicode)) 945 n/a _PyUnicode_WSTR_LENGTH(unicode) = length; 946 n/a } 947 n/a else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 948 n/a PyObject_DEL(_PyUnicode_WSTR(unicode)); 949 n/a _PyUnicode_WSTR(unicode) = NULL; 950 n/a if (!PyUnicode_IS_ASCII(unicode)) 951 n/a _PyUnicode_WSTR_LENGTH(unicode) = 0; 952 n/a } 953 n/a #ifdef Py_DEBUG 954 n/a unicode_fill_invalid(unicode, old_length); 955 n/a #endif 956 n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 957 n/a length, 0); 958 n/a assert(_PyUnicode_CheckConsistency(unicode, 0)); 959 n/a return unicode; 960 n/a } 961 n/a 962 n/a static int 963 n/a resize_inplace(PyObject *unicode, Py_ssize_t length) 964 n/a { 965 n/a wchar_t *wstr; 966 n/a Py_ssize_t new_size; 967 n/a assert(!PyUnicode_IS_COMPACT(unicode)); 968 n/a assert(Py_REFCNT(unicode) == 1); 969 n/a 970 n/a if (PyUnicode_IS_READY(unicode)) { 971 n/a Py_ssize_t char_size; 972 n/a int share_wstr, share_utf8; 973 n/a void *data; 974 n/a #ifdef Py_DEBUG 975 n/a Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 976 n/a #endif 977 n/a 978 n/a data = _PyUnicode_DATA_ANY(unicode); 979 n/a char_size = PyUnicode_KIND(unicode); 980 n/a share_wstr = _PyUnicode_SHARE_WSTR(unicode); 981 n/a share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 982 n/a 983 n/a if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 984 n/a PyErr_NoMemory(); 985 n/a return -1; 986 n/a } 987 n/a new_size = (length + 1) * char_size; 988 n/a 989 n/a if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 990 n/a { 991 n/a PyObject_DEL(_PyUnicode_UTF8(unicode)); 992 n/a _PyUnicode_UTF8(unicode) = NULL; 993 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 994 n/a } 995 n/a 996 n/a data = (PyObject *)PyObject_REALLOC(data, new_size); 997 n/a if (data == NULL) { 998 n/a PyErr_NoMemory(); 999 n/a return -1; 1000 n/a } 1001 n/a _PyUnicode_DATA_ANY(unicode) = data; 1002 n/a if (share_wstr) { 1003 n/a 
_PyUnicode_WSTR(unicode) = data; 1004 n/a _PyUnicode_WSTR_LENGTH(unicode) = length; 1005 n/a } 1006 n/a if (share_utf8) { 1007 n/a _PyUnicode_UTF8(unicode) = data; 1008 n/a _PyUnicode_UTF8_LENGTH(unicode) = length; 1009 n/a } 1010 n/a _PyUnicode_LENGTH(unicode) = length; 1011 n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 1012 n/a #ifdef Py_DEBUG 1013 n/a unicode_fill_invalid(unicode, old_length); 1014 n/a #endif 1015 n/a if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 1016 n/a assert(_PyUnicode_CheckConsistency(unicode, 0)); 1017 n/a return 0; 1018 n/a } 1019 n/a } 1020 n/a assert(_PyUnicode_WSTR(unicode) != NULL); 1021 n/a 1022 n/a /* check for integer overflow */ 1023 n/a if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 1024 n/a PyErr_NoMemory(); 1025 n/a return -1; 1026 n/a } 1027 n/a new_size = sizeof(wchar_t) * (length + 1); 1028 n/a wstr = _PyUnicode_WSTR(unicode); 1029 n/a wstr = PyObject_REALLOC(wstr, new_size); 1030 n/a if (!wstr) { 1031 n/a PyErr_NoMemory(); 1032 n/a return -1; 1033 n/a } 1034 n/a _PyUnicode_WSTR(unicode) = wstr; 1035 n/a _PyUnicode_WSTR(unicode)[length] = 0; 1036 n/a _PyUnicode_WSTR_LENGTH(unicode) = length; 1037 n/a assert(_PyUnicode_CheckConsistency(unicode, 0)); 1038 n/a return 0; 1039 n/a } 1040 n/a 1041 n/a static PyObject* 1042 n/a resize_copy(PyObject *unicode, Py_ssize_t length) 1043 n/a { 1044 n/a Py_ssize_t copy_length; 1045 n/a if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 1046 n/a PyObject *copy; 1047 n/a 1048 n/a assert(PyUnicode_IS_READY(unicode)); 1049 n/a 1050 n/a copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 1051 n/a if (copy == NULL) 1052 n/a return NULL; 1053 n/a 1054 n/a copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 1055 n/a _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 1056 n/a return copy; 1057 n/a } 1058 n/a else { 1059 n/a PyObject *w; 1060 n/a 1061 n/a w = (PyObject*)_PyUnicode_New(length); 1062 n/a if (w == 
NULL) 1063 n/a return NULL; 1064 n/a copy_length = _PyUnicode_WSTR_LENGTH(unicode); 1065 n/a copy_length = Py_MIN(copy_length, length); 1066 n/a memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 1067 n/a copy_length * sizeof(wchar_t)); 1068 n/a return w; 1069 n/a } 1070 n/a } 1071 n/a 1072 n/a /* We allocate one more byte to make sure the string is 1073 n/a Ux0000 terminated; some code (e.g. new_identifier) 1074 n/a relies on that. 1075 n/a 1076 n/a XXX This allocator could further be enhanced by assuring that the 1077 n/a free list never reduces its size below 1. 1078 n/a 1079 n/a */ 1080 n/a 1081 n/a static PyUnicodeObject * 1082 n/a _PyUnicode_New(Py_ssize_t length) 1083 n/a { 1084 n/a PyUnicodeObject *unicode; 1085 n/a size_t new_size; 1086 n/a 1087 n/a /* Optimization for empty strings */ 1088 n/a if (length == 0 && unicode_empty != NULL) { 1089 n/a Py_INCREF(unicode_empty); 1090 n/a return (PyUnicodeObject*)unicode_empty; 1091 n/a } 1092 n/a 1093 n/a /* Ensure we won't overflow the size. 
*/ 1094 n/a if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 1095 n/a return (PyUnicodeObject *)PyErr_NoMemory(); 1096 n/a } 1097 n/a if (length < 0) { 1098 n/a PyErr_SetString(PyExc_SystemError, 1099 n/a "Negative size passed to _PyUnicode_New"); 1100 n/a return NULL; 1101 n/a } 1102 n/a 1103 n/a unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 1104 n/a if (unicode == NULL) 1105 n/a return NULL; 1106 n/a new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 1107 n/a 1108 n/a _PyUnicode_WSTR_LENGTH(unicode) = length; 1109 n/a _PyUnicode_HASH(unicode) = -1; 1110 n/a _PyUnicode_STATE(unicode).interned = 0; 1111 n/a _PyUnicode_STATE(unicode).kind = 0; 1112 n/a _PyUnicode_STATE(unicode).compact = 0; 1113 n/a _PyUnicode_STATE(unicode).ready = 0; 1114 n/a _PyUnicode_STATE(unicode).ascii = 0; 1115 n/a _PyUnicode_DATA_ANY(unicode) = NULL; 1116 n/a _PyUnicode_LENGTH(unicode) = 0; 1117 n/a _PyUnicode_UTF8(unicode) = NULL; 1118 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 1119 n/a 1120 n/a _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 1121 n/a if (!_PyUnicode_WSTR(unicode)) { 1122 n/a Py_DECREF(unicode); 1123 n/a PyErr_NoMemory(); 1124 n/a return NULL; 1125 n/a } 1126 n/a 1127 n/a /* Initialize the first element to guard against cases where 1128 n/a * the caller fails before initializing str -- unicode_resize() 1129 n/a * reads str[0], and the Keep-Alive optimization can keep memory 1130 n/a * allocated for str alive across a call to unicode_dealloc(unicode). 1131 n/a * We don't want unicode_resize to read uninitialized memory in 1132 n/a * that case. 
1133 n/a */ 1134 n/a _PyUnicode_WSTR(unicode)[0] = 0; 1135 n/a _PyUnicode_WSTR(unicode)[length] = 0; 1136 n/a 1137 n/a assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 1138 n/a return unicode; 1139 n/a } 1140 n/a 1141 n/a static const char* 1142 n/a unicode_kind_name(PyObject *unicode) 1143 n/a { 1144 n/a /* don't check consistency: unicode_kind_name() is called from 1145 n/a _PyUnicode_Dump() */ 1146 n/a if (!PyUnicode_IS_COMPACT(unicode)) 1147 n/a { 1148 n/a if (!PyUnicode_IS_READY(unicode)) 1149 n/a return "wstr"; 1150 n/a switch (PyUnicode_KIND(unicode)) 1151 n/a { 1152 n/a case PyUnicode_1BYTE_KIND: 1153 n/a if (PyUnicode_IS_ASCII(unicode)) 1154 n/a return "legacy ascii"; 1155 n/a else 1156 n/a return "legacy latin1"; 1157 n/a case PyUnicode_2BYTE_KIND: 1158 n/a return "legacy UCS2"; 1159 n/a case PyUnicode_4BYTE_KIND: 1160 n/a return "legacy UCS4"; 1161 n/a default: 1162 n/a return "<legacy invalid kind>"; 1163 n/a } 1164 n/a } 1165 n/a assert(PyUnicode_IS_READY(unicode)); 1166 n/a switch (PyUnicode_KIND(unicode)) { 1167 n/a case PyUnicode_1BYTE_KIND: 1168 n/a if (PyUnicode_IS_ASCII(unicode)) 1169 n/a return "ascii"; 1170 n/a else 1171 n/a return "latin1"; 1172 n/a case PyUnicode_2BYTE_KIND: 1173 n/a return "UCS2"; 1174 n/a case PyUnicode_4BYTE_KIND: 1175 n/a return "UCS4"; 1176 n/a default: 1177 n/a return "<invalid compact kind>"; 1178 n/a } 1179 n/a } 1180 n/a 1181 n/a #ifdef Py_DEBUG 1182 n/a /* Functions wrapping macros for use in debugger */ 1183 n/a char *_PyUnicode_utf8(void *unicode){ 1184 n/a return PyUnicode_UTF8(unicode); 1185 n/a } 1186 n/a 1187 n/a void *_PyUnicode_compact_data(void *unicode) { 1188 n/a return _PyUnicode_COMPACT_DATA(unicode); 1189 n/a } 1190 n/a void *_PyUnicode_data(void *unicode){ 1191 n/a printf("obj %p\n", unicode); 1192 n/a printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 1193 n/a printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 1194 n/a printf("ascii op %p\n", 
((void*)((PyASCIIObject*)(unicode) + 1))); 1195 n/a printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 1196 n/a printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 1197 n/a return PyUnicode_DATA(unicode); 1198 n/a } 1199 n/a 1200 n/a void 1201 n/a _PyUnicode_Dump(PyObject *op) 1202 n/a { 1203 n/a PyASCIIObject *ascii = (PyASCIIObject *)op; 1204 n/a PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1205 n/a PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1206 n/a void *data; 1207 n/a 1208 n/a if (ascii->state.compact) 1209 n/a { 1210 n/a if (ascii->state.ascii) 1211 n/a data = (ascii + 1); 1212 n/a else 1213 n/a data = (compact + 1); 1214 n/a } 1215 n/a else 1216 n/a data = unicode->data.any; 1217 n/a printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1218 n/a unicode_kind_name(op), ascii->length); 1219 n/a 1220 n/a if (ascii->wstr == data) 1221 n/a printf("shared "); 1222 n/a printf("wstr=%p", ascii->wstr); 1223 n/a 1224 n/a if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1225 n/a printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1226 n/a if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1227 n/a printf("shared "); 1228 n/a printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1229 n/a compact->utf8, compact->utf8_length); 1230 n/a } 1231 n/a printf(", data=%p\n", data); 1232 n/a } 1233 n/a #endif 1234 n/a 1235 n/a PyObject * 1236 n/a PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1237 n/a { 1238 n/a PyObject *obj; 1239 n/a PyCompactUnicodeObject *unicode; 1240 n/a void *data; 1241 n/a enum PyUnicode_Kind kind; 1242 n/a int is_sharing, is_ascii; 1243 n/a Py_ssize_t char_size; 1244 n/a Py_ssize_t struct_size; 1245 n/a 1246 n/a /* Optimization for empty strings */ 1247 n/a if (size == 0 && unicode_empty != NULL) { 1248 n/a Py_INCREF(unicode_empty); 1249 n/a return unicode_empty; 1250 n/a } 1251 n/a 1252 n/a is_ascii = 0; 1253 n/a is_sharing = 0; 1254 n/a struct_size = 
sizeof(PyCompactUnicodeObject); 1255 n/a if (maxchar < 128) { 1256 n/a kind = PyUnicode_1BYTE_KIND; 1257 n/a char_size = 1; 1258 n/a is_ascii = 1; 1259 n/a struct_size = sizeof(PyASCIIObject); 1260 n/a } 1261 n/a else if (maxchar < 256) { 1262 n/a kind = PyUnicode_1BYTE_KIND; 1263 n/a char_size = 1; 1264 n/a } 1265 n/a else if (maxchar < 65536) { 1266 n/a kind = PyUnicode_2BYTE_KIND; 1267 n/a char_size = 2; 1268 n/a if (sizeof(wchar_t) == 2) 1269 n/a is_sharing = 1; 1270 n/a } 1271 n/a else { 1272 n/a if (maxchar > MAX_UNICODE) { 1273 n/a PyErr_SetString(PyExc_SystemError, 1274 n/a "invalid maximum character passed to PyUnicode_New"); 1275 n/a return NULL; 1276 n/a } 1277 n/a kind = PyUnicode_4BYTE_KIND; 1278 n/a char_size = 4; 1279 n/a if (sizeof(wchar_t) == 4) 1280 n/a is_sharing = 1; 1281 n/a } 1282 n/a 1283 n/a /* Ensure we won't overflow the size. */ 1284 n/a if (size < 0) { 1285 n/a PyErr_SetString(PyExc_SystemError, 1286 n/a "Negative size passed to PyUnicode_New"); 1287 n/a return NULL; 1288 n/a } 1289 n/a if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1290 n/a return PyErr_NoMemory(); 1291 n/a 1292 n/a /* Duplicated allocation code from _PyObject_New() instead of a call to 1293 n/a * PyObject_New() so we are able to allocate space for the object and 1294 n/a * it's data buffer. 
1295 n/a */ 1296 n/a obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1297 n/a if (obj == NULL) 1298 n/a return PyErr_NoMemory(); 1299 n/a obj = PyObject_INIT(obj, &PyUnicode_Type); 1300 n/a if (obj == NULL) 1301 n/a return NULL; 1302 n/a 1303 n/a unicode = (PyCompactUnicodeObject *)obj; 1304 n/a if (is_ascii) 1305 n/a data = ((PyASCIIObject*)obj) + 1; 1306 n/a else 1307 n/a data = unicode + 1; 1308 n/a _PyUnicode_LENGTH(unicode) = size; 1309 n/a _PyUnicode_HASH(unicode) = -1; 1310 n/a _PyUnicode_STATE(unicode).interned = 0; 1311 n/a _PyUnicode_STATE(unicode).kind = kind; 1312 n/a _PyUnicode_STATE(unicode).compact = 1; 1313 n/a _PyUnicode_STATE(unicode).ready = 1; 1314 n/a _PyUnicode_STATE(unicode).ascii = is_ascii; 1315 n/a if (is_ascii) { 1316 n/a ((char*)data)[size] = 0; 1317 n/a _PyUnicode_WSTR(unicode) = NULL; 1318 n/a } 1319 n/a else if (kind == PyUnicode_1BYTE_KIND) { 1320 n/a ((char*)data)[size] = 0; 1321 n/a _PyUnicode_WSTR(unicode) = NULL; 1322 n/a _PyUnicode_WSTR_LENGTH(unicode) = 0; 1323 n/a unicode->utf8 = NULL; 1324 n/a unicode->utf8_length = 0; 1325 n/a } 1326 n/a else { 1327 n/a unicode->utf8 = NULL; 1328 n/a unicode->utf8_length = 0; 1329 n/a if (kind == PyUnicode_2BYTE_KIND) 1330 n/a ((Py_UCS2*)data)[size] = 0; 1331 n/a else /* kind == PyUnicode_4BYTE_KIND */ 1332 n/a ((Py_UCS4*)data)[size] = 0; 1333 n/a if (is_sharing) { 1334 n/a _PyUnicode_WSTR_LENGTH(unicode) = size; 1335 n/a _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1336 n/a } 1337 n/a else { 1338 n/a _PyUnicode_WSTR_LENGTH(unicode) = 0; 1339 n/a _PyUnicode_WSTR(unicode) = NULL; 1340 n/a } 1341 n/a } 1342 n/a #ifdef Py_DEBUG 1343 n/a unicode_fill_invalid((PyObject*)unicode, 0); 1344 n/a #endif 1345 n/a assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1346 n/a return obj; 1347 n/a } 1348 n/a 1349 n/a #if SIZEOF_WCHAR_T == 2 1350 n/a /* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1351 n/a will decode surrogate pairs, the other 
conversions are implemented as macros 1352 n/a for efficiency. 1353 n/a 1354 n/a This function assumes that unicode can hold one more code point than wstr 1355 n/a characters for a terminating null character. */ 1356 n/a static void 1357 n/a unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1358 n/a PyObject *unicode) 1359 n/a { 1360 n/a const wchar_t *iter; 1361 n/a Py_UCS4 *ucs4_out; 1362 n/a 1363 n/a assert(unicode != NULL); 1364 n/a assert(_PyUnicode_CHECK(unicode)); 1365 n/a assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1366 n/a ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1367 n/a 1368 n/a for (iter = begin; iter < end; ) { 1369 n/a assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1370 n/a _PyUnicode_GET_LENGTH(unicode))); 1371 n/a if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1372 n/a && (iter+1) < end 1373 n/a && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1374 n/a { 1375 n/a *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1376 n/a iter += 2; 1377 n/a } 1378 n/a else { 1379 n/a *ucs4_out++ = *iter; 1380 n/a iter++; 1381 n/a } 1382 n/a } 1383 n/a assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1384 n/a _PyUnicode_GET_LENGTH(unicode))); 1385 n/a 1386 n/a } 1387 n/a #endif 1388 n/a 1389 n/a static int 1390 n/a unicode_check_modifiable(PyObject *unicode) 1391 n/a { 1392 n/a if (!unicode_modifiable(unicode)) { 1393 n/a PyErr_SetString(PyExc_SystemError, 1394 n/a "Cannot modify a string currently used"); 1395 n/a return -1; 1396 n/a } 1397 n/a return 0; 1398 n/a } 1399 n/a 1400 n/a static int 1401 n/a _copy_characters(PyObject *to, Py_ssize_t to_start, 1402 n/a PyObject *from, Py_ssize_t from_start, 1403 n/a Py_ssize_t how_many, int check_maxchar) 1404 n/a { 1405 n/a unsigned int from_kind, to_kind; 1406 n/a void *from_data, *to_data; 1407 n/a 1408 n/a assert(0 <= how_many); 1409 n/a assert(0 <= from_start); 1410 n/a assert(0 <= to_start); 1411 n/a assert(PyUnicode_Check(from)); 1412 n/a assert(PyUnicode_IS_READY(from)); 
1413 n/a assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1414 n/a 1415 n/a assert(PyUnicode_Check(to)); 1416 n/a assert(PyUnicode_IS_READY(to)); 1417 n/a assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1418 n/a 1419 n/a if (how_many == 0) 1420 n/a return 0; 1421 n/a 1422 n/a from_kind = PyUnicode_KIND(from); 1423 n/a from_data = PyUnicode_DATA(from); 1424 n/a to_kind = PyUnicode_KIND(to); 1425 n/a to_data = PyUnicode_DATA(to); 1426 n/a 1427 n/a #ifdef Py_DEBUG 1428 n/a if (!check_maxchar 1429 n/a && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1430 n/a { 1431 n/a const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1432 n/a Py_UCS4 ch; 1433 n/a Py_ssize_t i; 1434 n/a for (i=0; i < how_many; i++) { 1435 n/a ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1436 n/a assert(ch <= to_maxchar); 1437 n/a } 1438 n/a } 1439 n/a #endif 1440 n/a 1441 n/a if (from_kind == to_kind) { 1442 n/a if (check_maxchar 1443 n/a && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1444 n/a { 1445 n/a /* Writing Latin-1 characters into an ASCII string requires to 1446 n/a check that all written characters are pure ASCII */ 1447 n/a Py_UCS4 max_char; 1448 n/a max_char = ucs1lib_find_max_char(from_data, 1449 n/a (Py_UCS1*)from_data + how_many); 1450 n/a if (max_char >= 128) 1451 n/a return -1; 1452 n/a } 1453 n/a memcpy((char*)to_data + to_kind * to_start, 1454 n/a (char*)from_data + from_kind * from_start, 1455 n/a to_kind * how_many); 1456 n/a } 1457 n/a else if (from_kind == PyUnicode_1BYTE_KIND 1458 n/a && to_kind == PyUnicode_2BYTE_KIND) 1459 n/a { 1460 n/a _PyUnicode_CONVERT_BYTES( 1461 n/a Py_UCS1, Py_UCS2, 1462 n/a PyUnicode_1BYTE_DATA(from) + from_start, 1463 n/a PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1464 n/a PyUnicode_2BYTE_DATA(to) + to_start 1465 n/a ); 1466 n/a } 1467 n/a else if (from_kind == PyUnicode_1BYTE_KIND 1468 n/a && to_kind == PyUnicode_4BYTE_KIND) 1469 n/a { 1470 n/a _PyUnicode_CONVERT_BYTES( 1471 
n/a Py_UCS1, Py_UCS4, 1472 n/a PyUnicode_1BYTE_DATA(from) + from_start, 1473 n/a PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1474 n/a PyUnicode_4BYTE_DATA(to) + to_start 1475 n/a ); 1476 n/a } 1477 n/a else if (from_kind == PyUnicode_2BYTE_KIND 1478 n/a && to_kind == PyUnicode_4BYTE_KIND) 1479 n/a { 1480 n/a _PyUnicode_CONVERT_BYTES( 1481 n/a Py_UCS2, Py_UCS4, 1482 n/a PyUnicode_2BYTE_DATA(from) + from_start, 1483 n/a PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1484 n/a PyUnicode_4BYTE_DATA(to) + to_start 1485 n/a ); 1486 n/a } 1487 n/a else { 1488 n/a assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1489 n/a 1490 n/a if (!check_maxchar) { 1491 n/a if (from_kind == PyUnicode_2BYTE_KIND 1492 n/a && to_kind == PyUnicode_1BYTE_KIND) 1493 n/a { 1494 n/a _PyUnicode_CONVERT_BYTES( 1495 n/a Py_UCS2, Py_UCS1, 1496 n/a PyUnicode_2BYTE_DATA(from) + from_start, 1497 n/a PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1498 n/a PyUnicode_1BYTE_DATA(to) + to_start 1499 n/a ); 1500 n/a } 1501 n/a else if (from_kind == PyUnicode_4BYTE_KIND 1502 n/a && to_kind == PyUnicode_1BYTE_KIND) 1503 n/a { 1504 n/a _PyUnicode_CONVERT_BYTES( 1505 n/a Py_UCS4, Py_UCS1, 1506 n/a PyUnicode_4BYTE_DATA(from) + from_start, 1507 n/a PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1508 n/a PyUnicode_1BYTE_DATA(to) + to_start 1509 n/a ); 1510 n/a } 1511 n/a else if (from_kind == PyUnicode_4BYTE_KIND 1512 n/a && to_kind == PyUnicode_2BYTE_KIND) 1513 n/a { 1514 n/a _PyUnicode_CONVERT_BYTES( 1515 n/a Py_UCS4, Py_UCS2, 1516 n/a PyUnicode_4BYTE_DATA(from) + from_start, 1517 n/a PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1518 n/a PyUnicode_2BYTE_DATA(to) + to_start 1519 n/a ); 1520 n/a } 1521 n/a else { 1522 n/a assert(0); 1523 n/a return -1; 1524 n/a } 1525 n/a } 1526 n/a else { 1527 n/a const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1528 n/a Py_UCS4 ch; 1529 n/a Py_ssize_t i; 1530 n/a 1531 n/a for (i=0; i < how_many; i++) { 1532 n/a ch = 
PyUnicode_READ(from_kind, from_data, from_start + i); 1533 n/a if (ch > to_maxchar) 1534 n/a return -1; 1535 n/a PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1536 n/a } 1537 n/a } 1538 n/a } 1539 n/a return 0; 1540 n/a } 1541 n/a 1542 n/a void 1543 n/a _PyUnicode_FastCopyCharacters( 1544 n/a PyObject *to, Py_ssize_t to_start, 1545 n/a PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1546 n/a { 1547 n/a (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1548 n/a } 1549 n/a 1550 n/a Py_ssize_t 1551 n/a PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1552 n/a PyObject *from, Py_ssize_t from_start, 1553 n/a Py_ssize_t how_many) 1554 n/a { 1555 n/a int err; 1556 n/a 1557 n/a if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1558 n/a PyErr_BadInternalCall(); 1559 n/a return -1; 1560 n/a } 1561 n/a 1562 n/a if (PyUnicode_READY(from) == -1) 1563 n/a return -1; 1564 n/a if (PyUnicode_READY(to) == -1) 1565 n/a return -1; 1566 n/a 1567 n/a if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { 1568 n/a PyErr_SetString(PyExc_IndexError, "string index out of range"); 1569 n/a return -1; 1570 n/a } 1571 n/a if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { 1572 n/a PyErr_SetString(PyExc_IndexError, "string index out of range"); 1573 n/a return -1; 1574 n/a } 1575 n/a if (how_many < 0) { 1576 n/a PyErr_SetString(PyExc_SystemError, "how_many cannot be negative"); 1577 n/a return -1; 1578 n/a } 1579 n/a how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many); 1580 n/a if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1581 n/a PyErr_Format(PyExc_SystemError, 1582 n/a "Cannot write %zi characters at %zi " 1583 n/a "in a string of %zi characters", 1584 n/a how_many, to_start, PyUnicode_GET_LENGTH(to)); 1585 n/a return -1; 1586 n/a } 1587 n/a 1588 n/a if (how_many == 0) 1589 n/a return 0; 1590 n/a 1591 n/a if (unicode_check_modifiable(to)) 1592 n/a return -1; 1593 n/a 1594 n/a err = 
_copy_characters(to, to_start, from, from_start, how_many, 1); 1595 n/a if (err) { 1596 n/a PyErr_Format(PyExc_SystemError, 1597 n/a "Cannot copy %s characters " 1598 n/a "into a string of %s characters", 1599 n/a unicode_kind_name(from), 1600 n/a unicode_kind_name(to)); 1601 n/a return -1; 1602 n/a } 1603 n/a return how_many; 1604 n/a } 1605 n/a 1606 n/a /* Find the maximum code point and count the number of surrogate pairs so a 1607 n/a correct string length can be computed before converting a string to UCS4. 1608 n/a This function counts single surrogates as a character and not as a pair. 1609 n/a 1610 n/a Return 0 on success, or -1 on error. */ 1611 n/a static int 1612 n/a find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1613 n/a Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1614 n/a { 1615 n/a const wchar_t *iter; 1616 n/a Py_UCS4 ch; 1617 n/a 1618 n/a assert(num_surrogates != NULL && maxchar != NULL); 1619 n/a *num_surrogates = 0; 1620 n/a *maxchar = 0; 1621 n/a 1622 n/a for (iter = begin; iter < end; ) { 1623 n/a #if SIZEOF_WCHAR_T == 2 1624 n/a if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1625 n/a && (iter+1) < end 1626 n/a && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1627 n/a { 1628 n/a ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1629 n/a ++(*num_surrogates); 1630 n/a iter += 2; 1631 n/a } 1632 n/a else 1633 n/a #endif 1634 n/a { 1635 n/a ch = *iter; 1636 n/a iter++; 1637 n/a } 1638 n/a if (ch > *maxchar) { 1639 n/a *maxchar = ch; 1640 n/a if (*maxchar > MAX_UNICODE) { 1641 n/a PyErr_Format(PyExc_ValueError, 1642 n/a "character U+%x is not in range [U+0000; U+10ffff]", 1643 n/a ch); 1644 n/a return -1; 1645 n/a } 1646 n/a } 1647 n/a } 1648 n/a return 0; 1649 n/a } 1650 n/a 1651 n/a int 1652 n/a _PyUnicode_Ready(PyObject *unicode) 1653 n/a { 1654 n/a wchar_t *end; 1655 n/a Py_UCS4 maxchar = 0; 1656 n/a Py_ssize_t num_surrogates; 1657 n/a #if SIZEOF_WCHAR_T == 2 1658 n/a Py_ssize_t length_wo_surrogates; 1659 n/a #endif 1660 n/a 1661 n/a /* 
_PyUnicode_Ready() is only intended for old-style API usage where 1662 n/a strings were created using _PyObject_New() and where no canonical 1663 n/a representation (the str field) has been set yet aka strings 1664 n/a which are not yet ready. */ 1665 n/a assert(_PyUnicode_CHECK(unicode)); 1666 n/a assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1667 n/a assert(_PyUnicode_WSTR(unicode) != NULL); 1668 n/a assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1669 n/a assert(_PyUnicode_UTF8(unicode) == NULL); 1670 n/a /* Actually, it should neither be interned nor be anything else: */ 1671 n/a assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1672 n/a 1673 n/a end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1674 n/a if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1675 n/a &maxchar, &num_surrogates) == -1) 1676 n/a return -1; 1677 n/a 1678 n/a if (maxchar < 256) { 1679 n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1680 n/a if (!_PyUnicode_DATA_ANY(unicode)) { 1681 n/a PyErr_NoMemory(); 1682 n/a return -1; 1683 n/a } 1684 n/a _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1685 n/a _PyUnicode_WSTR(unicode), end, 1686 n/a PyUnicode_1BYTE_DATA(unicode)); 1687 n/a PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1688 n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1689 n/a _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1690 n/a if (maxchar < 128) { 1691 n/a _PyUnicode_STATE(unicode).ascii = 1; 1692 n/a _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1693 n/a _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1694 n/a } 1695 n/a else { 1696 n/a _PyUnicode_STATE(unicode).ascii = 0; 1697 n/a _PyUnicode_UTF8(unicode) = NULL; 1698 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 1699 n/a } 1700 n/a PyObject_FREE(_PyUnicode_WSTR(unicode)); 1701 n/a _PyUnicode_WSTR(unicode) = NULL; 1702 n/a _PyUnicode_WSTR_LENGTH(unicode) = 
0; 1703 n/a } 1704 n/a /* In this case we might have to convert down from 4-byte native 1705 n/a wchar_t to 2-byte unicode. */ 1706 n/a else if (maxchar < 65536) { 1707 n/a assert(num_surrogates == 0 && 1708 n/a "FindMaxCharAndNumSurrogatePairs() messed up"); 1709 n/a 1710 n/a #if SIZEOF_WCHAR_T == 2 1711 n/a /* We can share representations and are done. */ 1712 n/a _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1713 n/a PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1714 n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1715 n/a _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1716 n/a _PyUnicode_UTF8(unicode) = NULL; 1717 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 1718 n/a #else 1719 n/a /* sizeof(wchar_t) == 4 */ 1720 n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1721 n/a 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1722 n/a if (!_PyUnicode_DATA_ANY(unicode)) { 1723 n/a PyErr_NoMemory(); 1724 n/a return -1; 1725 n/a } 1726 n/a _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1727 n/a _PyUnicode_WSTR(unicode), end, 1728 n/a PyUnicode_2BYTE_DATA(unicode)); 1729 n/a PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1730 n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1731 n/a _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1732 n/a _PyUnicode_UTF8(unicode) = NULL; 1733 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 1734 n/a PyObject_FREE(_PyUnicode_WSTR(unicode)); 1735 n/a _PyUnicode_WSTR(unicode) = NULL; 1736 n/a _PyUnicode_WSTR_LENGTH(unicode) = 0; 1737 n/a #endif 1738 n/a } 1739 n/a /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1740 n/a else { 1741 n/a #if SIZEOF_WCHAR_T == 2 1742 n/a /* in case the native representation is 2-bytes, we need to allocate a 1743 n/a new normalized 4-byte version. 
*/ 1744 n/a length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1745 n/a if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1746 n/a PyErr_NoMemory(); 1747 n/a return -1; 1748 n/a } 1749 n/a _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1750 n/a if (!_PyUnicode_DATA_ANY(unicode)) { 1751 n/a PyErr_NoMemory(); 1752 n/a return -1; 1753 n/a } 1754 n/a _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1755 n/a _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1756 n/a _PyUnicode_UTF8(unicode) = NULL; 1757 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 1758 n/a /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1759 n/a _PyUnicode_STATE(unicode).ready = 1; 1760 n/a unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1761 n/a PyObject_FREE(_PyUnicode_WSTR(unicode)); 1762 n/a _PyUnicode_WSTR(unicode) = NULL; 1763 n/a _PyUnicode_WSTR_LENGTH(unicode) = 0; 1764 n/a #else 1765 n/a assert(num_surrogates == 0); 1766 n/a 1767 n/a _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1768 n/a _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1769 n/a _PyUnicode_UTF8(unicode) = NULL; 1770 n/a _PyUnicode_UTF8_LENGTH(unicode) = 0; 1771 n/a _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1772 n/a #endif 1773 n/a PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1774 n/a } 1775 n/a _PyUnicode_STATE(unicode).ready = 1; 1776 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 1777 n/a return 0; 1778 n/a } 1779 n/a 1780 n/a static void 1781 n/a unicode_dealloc(PyObject *unicode) 1782 n/a { 1783 n/a switch (PyUnicode_CHECK_INTERNED(unicode)) { 1784 n/a case SSTATE_NOT_INTERNED: 1785 n/a break; 1786 n/a 1787 n/a case SSTATE_INTERNED_MORTAL: 1788 n/a /* revive dead object temporarily for DelItem */ 1789 n/a Py_REFCNT(unicode) = 3; 1790 n/a if (PyDict_DelItem(interned, unicode) != 0) 1791 n/a Py_FatalError( 1792 n/a "deletion of interned string failed"); 1793 n/a 
break; 1794 n/a 1795 n/a case SSTATE_INTERNED_IMMORTAL: 1796 n/a Py_FatalError("Immortal interned string died."); 1797 n/a 1798 n/a default: 1799 n/a Py_FatalError("Inconsistent interned string state."); 1800 n/a } 1801 n/a 1802 n/a if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1803 n/a PyObject_DEL(_PyUnicode_WSTR(unicode)); 1804 n/a if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1805 n/a PyObject_DEL(_PyUnicode_UTF8(unicode)); 1806 n/a if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1807 n/a PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1808 n/a 1809 n/a Py_TYPE(unicode)->tp_free(unicode); 1810 n/a } 1811 n/a 1812 n/a #ifdef Py_DEBUG 1813 n/a static int 1814 n/a unicode_is_singleton(PyObject *unicode) 1815 n/a { 1816 n/a PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1817 n/a if (unicode == unicode_empty) 1818 n/a return 1; 1819 n/a if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1820 n/a { 1821 n/a Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1822 n/a if (ch < 256 && unicode_latin1[ch] == unicode) 1823 n/a return 1; 1824 n/a } 1825 n/a return 0; 1826 n/a } 1827 n/a #endif 1828 n/a 1829 n/a static int 1830 n/a unicode_modifiable(PyObject *unicode) 1831 n/a { 1832 n/a assert(_PyUnicode_CHECK(unicode)); 1833 n/a if (Py_REFCNT(unicode) != 1) 1834 n/a return 0; 1835 n/a if (_PyUnicode_HASH(unicode) != -1) 1836 n/a return 0; 1837 n/a if (PyUnicode_CHECK_INTERNED(unicode)) 1838 n/a return 0; 1839 n/a if (!PyUnicode_CheckExact(unicode)) 1840 n/a return 0; 1841 n/a #ifdef Py_DEBUG 1842 n/a /* singleton refcount is greater than 1 */ 1843 n/a assert(!unicode_is_singleton(unicode)); 1844 n/a #endif 1845 n/a return 1; 1846 n/a } 1847 n/a 1848 n/a static int 1849 n/a unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1850 n/a { 1851 n/a PyObject *unicode; 1852 n/a Py_ssize_t old_length; 1853 n/a 1854 n/a assert(p_unicode != NULL); 1855 n/a unicode = *p_unicode; 1856 n/a 1857 n/a assert(unicode != NULL); 1858 n/a 
assert(PyUnicode_Check(unicode)); 1859 n/a assert(0 <= length); 1860 n/a 1861 n/a if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1862 n/a old_length = PyUnicode_WSTR_LENGTH(unicode); 1863 n/a else 1864 n/a old_length = PyUnicode_GET_LENGTH(unicode); 1865 n/a if (old_length == length) 1866 n/a return 0; 1867 n/a 1868 n/a if (length == 0) { 1869 n/a _Py_INCREF_UNICODE_EMPTY(); 1870 n/a if (!unicode_empty) 1871 n/a return -1; 1872 n/a Py_SETREF(*p_unicode, unicode_empty); 1873 n/a return 0; 1874 n/a } 1875 n/a 1876 n/a if (!unicode_modifiable(unicode)) { 1877 n/a PyObject *copy = resize_copy(unicode, length); 1878 n/a if (copy == NULL) 1879 n/a return -1; 1880 n/a Py_SETREF(*p_unicode, copy); 1881 n/a return 0; 1882 n/a } 1883 n/a 1884 n/a if (PyUnicode_IS_COMPACT(unicode)) { 1885 n/a PyObject *new_unicode = resize_compact(unicode, length); 1886 n/a if (new_unicode == NULL) 1887 n/a return -1; 1888 n/a *p_unicode = new_unicode; 1889 n/a return 0; 1890 n/a } 1891 n/a return resize_inplace(unicode, length); 1892 n/a } 1893 n/a 1894 n/a int 1895 n/a PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1896 n/a { 1897 n/a PyObject *unicode; 1898 n/a if (p_unicode == NULL) { 1899 n/a PyErr_BadInternalCall(); 1900 n/a return -1; 1901 n/a } 1902 n/a unicode = *p_unicode; 1903 n/a if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1904 n/a { 1905 n/a PyErr_BadInternalCall(); 1906 n/a return -1; 1907 n/a } 1908 n/a return unicode_resize(p_unicode, length); 1909 n/a } 1910 n/a 1911 n/a /* Copy an ASCII or latin1 char* string into a Python Unicode string. 1912 n/a 1913 n/a WARNING: The function doesn't copy the terminating null character and 1914 n/a doesn't check the maximum character (may write a latin1 character in an 1915 n/a ASCII string). 
*/ 1916 n/a static void 1917 n/a unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1918 n/a const char *str, Py_ssize_t len) 1919 n/a { 1920 n/a enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1921 n/a void *data = PyUnicode_DATA(unicode); 1922 n/a const char *end = str + len; 1923 n/a 1924 n/a switch (kind) { 1925 n/a case PyUnicode_1BYTE_KIND: { 1926 n/a assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1927 n/a #ifdef Py_DEBUG 1928 n/a if (PyUnicode_IS_ASCII(unicode)) { 1929 n/a Py_UCS4 maxchar = ucs1lib_find_max_char( 1930 n/a (const Py_UCS1*)str, 1931 n/a (const Py_UCS1*)str + len); 1932 n/a assert(maxchar < 128); 1933 n/a } 1934 n/a #endif 1935 n/a memcpy((char *) data + index, str, len); 1936 n/a break; 1937 n/a } 1938 n/a case PyUnicode_2BYTE_KIND: { 1939 n/a Py_UCS2 *start = (Py_UCS2 *)data + index; 1940 n/a Py_UCS2 *ucs2 = start; 1941 n/a assert(index <= PyUnicode_GET_LENGTH(unicode)); 1942 n/a 1943 n/a for (; str < end; ++ucs2, ++str) 1944 n/a *ucs2 = (Py_UCS2)*str; 1945 n/a 1946 n/a assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1947 n/a break; 1948 n/a } 1949 n/a default: { 1950 n/a Py_UCS4 *start = (Py_UCS4 *)data + index; 1951 n/a Py_UCS4 *ucs4 = start; 1952 n/a assert(kind == PyUnicode_4BYTE_KIND); 1953 n/a assert(index <= PyUnicode_GET_LENGTH(unicode)); 1954 n/a 1955 n/a for (; str < end; ++ucs4, ++str) 1956 n/a *ucs4 = (Py_UCS4)*str; 1957 n/a 1958 n/a assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1959 n/a } 1960 n/a } 1961 n/a } 1962 n/a 1963 n/a static PyObject* 1964 n/a get_latin1_char(unsigned char ch) 1965 n/a { 1966 n/a PyObject *unicode = unicode_latin1[ch]; 1967 n/a if (!unicode) { 1968 n/a unicode = PyUnicode_New(1, ch); 1969 n/a if (!unicode) 1970 n/a return NULL; 1971 n/a PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1972 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 1973 n/a unicode_latin1[ch] = unicode; 1974 n/a } 1975 n/a Py_INCREF(unicode); 1976 n/a return unicode; 1977 n/a } 1978 n/a 1979 n/a 
static PyObject* 1980 n/a unicode_char(Py_UCS4 ch) 1981 n/a { 1982 n/a PyObject *unicode; 1983 n/a 1984 n/a assert(ch <= MAX_UNICODE); 1985 n/a 1986 n/a if (ch < 256) 1987 n/a return get_latin1_char(ch); 1988 n/a 1989 n/a unicode = PyUnicode_New(1, ch); 1990 n/a if (unicode == NULL) 1991 n/a return NULL; 1992 n/a 1993 n/a assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); 1994 n/a if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 1995 n/a PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1996 n/a } else { 1997 n/a assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1998 n/a PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1999 n/a } 2000 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 2001 n/a return unicode; 2002 n/a } 2003 n/a 2004 n/a PyObject * 2005 n/a PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 2006 n/a { 2007 n/a if (u == NULL) 2008 n/a return (PyObject*)_PyUnicode_New(size); 2009 n/a 2010 n/a if (size < 0) { 2011 n/a PyErr_BadInternalCall(); 2012 n/a return NULL; 2013 n/a } 2014 n/a 2015 n/a return PyUnicode_FromWideChar(u, size); 2016 n/a } 2017 n/a 2018 n/a PyObject * 2019 n/a PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) 2020 n/a { 2021 n/a PyObject *unicode; 2022 n/a Py_UCS4 maxchar = 0; 2023 n/a Py_ssize_t num_surrogates; 2024 n/a 2025 n/a if (u == NULL && size != 0) { 2026 n/a PyErr_BadInternalCall(); 2027 n/a return NULL; 2028 n/a } 2029 n/a 2030 n/a if (size == -1) { 2031 n/a size = wcslen(u); 2032 n/a } 2033 n/a 2034 n/a /* If the Unicode data is known at construction time, we can apply 2035 n/a some optimizations which share commonly used objects. 
*/ 2036 n/a 2037 n/a /* Optimization for empty strings */ 2038 n/a if (size == 0) 2039 n/a _Py_RETURN_UNICODE_EMPTY(); 2040 n/a 2041 n/a /* Single character Unicode objects in the Latin-1 range are 2042 n/a shared when using this constructor */ 2043 n/a if (size == 1 && (Py_UCS4)*u < 256) 2044 n/a return get_latin1_char((unsigned char)*u); 2045 n/a 2046 n/a /* If not empty and not single character, copy the Unicode data 2047 n/a into the new object */ 2048 n/a if (find_maxchar_surrogates(u, u + size, 2049 n/a &maxchar, &num_surrogates) == -1) 2050 n/a return NULL; 2051 n/a 2052 n/a unicode = PyUnicode_New(size - num_surrogates, maxchar); 2053 n/a if (!unicode) 2054 n/a return NULL; 2055 n/a 2056 n/a switch (PyUnicode_KIND(unicode)) { 2057 n/a case PyUnicode_1BYTE_KIND: 2058 n/a _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 2059 n/a u, u + size, PyUnicode_1BYTE_DATA(unicode)); 2060 n/a break; 2061 n/a case PyUnicode_2BYTE_KIND: 2062 n/a #if Py_UNICODE_SIZE == 2 2063 n/a memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 2064 n/a #else 2065 n/a _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 2066 n/a u, u + size, PyUnicode_2BYTE_DATA(unicode)); 2067 n/a #endif 2068 n/a break; 2069 n/a case PyUnicode_4BYTE_KIND: 2070 n/a #if SIZEOF_WCHAR_T == 2 2071 n/a /* This is the only case which has to process surrogates, thus 2072 n/a a simple copy loop is not enough and we need a function. 
*/ 2073 n/a unicode_convert_wchar_to_ucs4(u, u + size, unicode); 2074 n/a #else 2075 n/a assert(num_surrogates == 0); 2076 n/a memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 2077 n/a #endif 2078 n/a break; 2079 n/a default: 2080 n/a assert(0 && "Impossible state"); 2081 n/a } 2082 n/a 2083 n/a return unicode_result(unicode); 2084 n/a } 2085 n/a 2086 n/a PyObject * 2087 n/a PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 2088 n/a { 2089 n/a if (size < 0) { 2090 n/a PyErr_SetString(PyExc_SystemError, 2091 n/a "Negative size passed to PyUnicode_FromStringAndSize"); 2092 n/a return NULL; 2093 n/a } 2094 n/a if (u != NULL) 2095 n/a return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 2096 n/a else 2097 n/a return (PyObject *)_PyUnicode_New(size); 2098 n/a } 2099 n/a 2100 n/a PyObject * 2101 n/a PyUnicode_FromString(const char *u) 2102 n/a { 2103 n/a size_t size = strlen(u); 2104 n/a if (size > PY_SSIZE_T_MAX) { 2105 n/a PyErr_SetString(PyExc_OverflowError, "input too long"); 2106 n/a return NULL; 2107 n/a } 2108 n/a return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 2109 n/a } 2110 n/a 2111 n/a PyObject * 2112 n/a _PyUnicode_FromId(_Py_Identifier *id) 2113 n/a { 2114 n/a if (!id->object) { 2115 n/a id->object = PyUnicode_DecodeUTF8Stateful(id->string, 2116 n/a strlen(id->string), 2117 n/a NULL, NULL); 2118 n/a if (!id->object) 2119 n/a return NULL; 2120 n/a PyUnicode_InternInPlace(&id->object); 2121 n/a assert(!id->next); 2122 n/a id->next = static_strings; 2123 n/a static_strings = id; 2124 n/a } 2125 n/a return id->object; 2126 n/a } 2127 n/a 2128 n/a void 2129 n/a _PyUnicode_ClearStaticStrings() 2130 n/a { 2131 n/a _Py_Identifier *tmp, *s = static_strings; 2132 n/a while (s) { 2133 n/a Py_CLEAR(s->object); 2134 n/a tmp = s->next; 2135 n/a s->next = NULL; 2136 n/a s = tmp; 2137 n/a } 2138 n/a static_strings = NULL; 2139 n/a } 2140 n/a 2141 n/a /* Internal function, doesn't check maximum character */ 2142 n/a 2143 n/a PyObject* 
2144 n/a _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 2145 n/a { 2146 n/a const unsigned char *s = (const unsigned char *)buffer; 2147 n/a PyObject *unicode; 2148 n/a if (size == 1) { 2149 n/a #ifdef Py_DEBUG 2150 n/a assert((unsigned char)s[0] < 128); 2151 n/a #endif 2152 n/a return get_latin1_char(s[0]); 2153 n/a } 2154 n/a unicode = PyUnicode_New(size, 127); 2155 n/a if (!unicode) 2156 n/a return NULL; 2157 n/a memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 2158 n/a assert(_PyUnicode_CheckConsistency(unicode, 1)); 2159 n/a return unicode; 2160 n/a } 2161 n/a 2162 n/a static Py_UCS4 2163 n/a kind_maxchar_limit(unsigned int kind) 2164 n/a { 2165 n/a switch (kind) { 2166 n/a case PyUnicode_1BYTE_KIND: 2167 n/a return 0x80; 2168 n/a case PyUnicode_2BYTE_KIND: 2169 n/a return 0x100; 2170 n/a case PyUnicode_4BYTE_KIND: 2171 n/a return 0x10000; 2172 n/a default: 2173 n/a assert(0 && "invalid kind"); 2174 n/a return MAX_UNICODE; 2175 n/a } 2176 n/a } 2177 n/a 2178 n/a static inline Py_UCS4 2179 n/a align_maxchar(Py_UCS4 maxchar) 2180 n/a { 2181 n/a if (maxchar <= 127) 2182 n/a return 127; 2183 n/a else if (maxchar <= 255) 2184 n/a return 255; 2185 n/a else if (maxchar <= 65535) 2186 n/a return 65535; 2187 n/a else 2188 n/a return MAX_UNICODE; 2189 n/a } 2190 n/a 2191 n/a static PyObject* 2192 n/a _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 2193 n/a { 2194 n/a PyObject *res; 2195 n/a unsigned char max_char; 2196 n/a 2197 n/a if (size == 0) 2198 n/a _Py_RETURN_UNICODE_EMPTY(); 2199 n/a assert(size > 0); 2200 n/a if (size == 1) 2201 n/a return get_latin1_char(u[0]); 2202 n/a 2203 n/a max_char = ucs1lib_find_max_char(u, u + size); 2204 n/a res = PyUnicode_New(size, max_char); 2205 n/a if (!res) 2206 n/a return NULL; 2207 n/a memcpy(PyUnicode_1BYTE_DATA(res), u, size); 2208 n/a assert(_PyUnicode_CheckConsistency(res, 1)); 2209 n/a return res; 2210 n/a } 2211 n/a 2212 n/a static PyObject* 2213 n/a _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 
2214 n/a { 2215 n/a PyObject *res; 2216 n/a Py_UCS2 max_char; 2217 n/a 2218 n/a if (size == 0) 2219 n/a _Py_RETURN_UNICODE_EMPTY(); 2220 n/a assert(size > 0); 2221 n/a if (size == 1) 2222 n/a return unicode_char(u[0]); 2223 n/a 2224 n/a max_char = ucs2lib_find_max_char(u, u + size); 2225 n/a res = PyUnicode_New(size, max_char); 2226 n/a if (!res) 2227 n/a return NULL; 2228 n/a if (max_char >= 256) 2229 n/a memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2230 n/a else { 2231 n/a _PyUnicode_CONVERT_BYTES( 2232 n/a Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2233 n/a } 2234 n/a assert(_PyUnicode_CheckConsistency(res, 1)); 2235 n/a return res; 2236 n/a } 2237 n/a 2238 n/a static PyObject* 2239 n/a _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2240 n/a { 2241 n/a PyObject *res; 2242 n/a Py_UCS4 max_char; 2243 n/a 2244 n/a if (size == 0) 2245 n/a _Py_RETURN_UNICODE_EMPTY(); 2246 n/a assert(size > 0); 2247 n/a if (size == 1) 2248 n/a return unicode_char(u[0]); 2249 n/a 2250 n/a max_char = ucs4lib_find_max_char(u, u + size); 2251 n/a res = PyUnicode_New(size, max_char); 2252 n/a if (!res) 2253 n/a return NULL; 2254 n/a if (max_char < 256) 2255 n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2256 n/a PyUnicode_1BYTE_DATA(res)); 2257 n/a else if (max_char < 0x10000) 2258 n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2259 n/a PyUnicode_2BYTE_DATA(res)); 2260 n/a else 2261 n/a memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2262 n/a assert(_PyUnicode_CheckConsistency(res, 1)); 2263 n/a return res; 2264 n/a } 2265 n/a 2266 n/a PyObject* 2267 n/a PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2268 n/a { 2269 n/a if (size < 0) { 2270 n/a PyErr_SetString(PyExc_ValueError, "size must be positive"); 2271 n/a return NULL; 2272 n/a } 2273 n/a switch (kind) { 2274 n/a case PyUnicode_1BYTE_KIND: 2275 n/a return _PyUnicode_FromUCS1(buffer, size); 2276 n/a case PyUnicode_2BYTE_KIND: 2277 
n/a return _PyUnicode_FromUCS2(buffer, size); 2278 n/a case PyUnicode_4BYTE_KIND: 2279 n/a return _PyUnicode_FromUCS4(buffer, size); 2280 n/a default: 2281 n/a PyErr_SetString(PyExc_SystemError, "invalid kind"); 2282 n/a return NULL; 2283 n/a } 2284 n/a } 2285 n/a 2286 n/a Py_UCS4 2287 n/a _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2288 n/a { 2289 n/a enum PyUnicode_Kind kind; 2290 n/a void *startptr, *endptr; 2291 n/a 2292 n/a assert(PyUnicode_IS_READY(unicode)); 2293 n/a assert(0 <= start); 2294 n/a assert(end <= PyUnicode_GET_LENGTH(unicode)); 2295 n/a assert(start <= end); 2296 n/a 2297 n/a if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2298 n/a return PyUnicode_MAX_CHAR_VALUE(unicode); 2299 n/a 2300 n/a if (start == end) 2301 n/a return 127; 2302 n/a 2303 n/a if (PyUnicode_IS_ASCII(unicode)) 2304 n/a return 127; 2305 n/a 2306 n/a kind = PyUnicode_KIND(unicode); 2307 n/a startptr = PyUnicode_DATA(unicode); 2308 n/a endptr = (char *)startptr + end * kind; 2309 n/a startptr = (char *)startptr + start * kind; 2310 n/a switch(kind) { 2311 n/a case PyUnicode_1BYTE_KIND: 2312 n/a return ucs1lib_find_max_char(startptr, endptr); 2313 n/a case PyUnicode_2BYTE_KIND: 2314 n/a return ucs2lib_find_max_char(startptr, endptr); 2315 n/a case PyUnicode_4BYTE_KIND: 2316 n/a return ucs4lib_find_max_char(startptr, endptr); 2317 n/a default: 2318 n/a assert(0); 2319 n/a return 0; 2320 n/a } 2321 n/a } 2322 n/a 2323 n/a /* Ensure that a string uses the most efficient storage, if it is not the 2324 n/a case: create a new string with of the right kind. Write NULL into *p_unicode 2325 n/a on error. 
*/ 2326 n/a static void 2327 n/a unicode_adjust_maxchar(PyObject **p_unicode) 2328 n/a { 2329 n/a PyObject *unicode, *copy; 2330 n/a Py_UCS4 max_char; 2331 n/a Py_ssize_t len; 2332 n/a unsigned int kind; 2333 n/a 2334 n/a assert(p_unicode != NULL); 2335 n/a unicode = *p_unicode; 2336 n/a assert(PyUnicode_IS_READY(unicode)); 2337 n/a if (PyUnicode_IS_ASCII(unicode)) 2338 n/a return; 2339 n/a 2340 n/a len = PyUnicode_GET_LENGTH(unicode); 2341 n/a kind = PyUnicode_KIND(unicode); 2342 n/a if (kind == PyUnicode_1BYTE_KIND) { 2343 n/a const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2344 n/a max_char = ucs1lib_find_max_char(u, u + len); 2345 n/a if (max_char >= 128) 2346 n/a return; 2347 n/a } 2348 n/a else if (kind == PyUnicode_2BYTE_KIND) { 2349 n/a const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2350 n/a max_char = ucs2lib_find_max_char(u, u + len); 2351 n/a if (max_char >= 256) 2352 n/a return; 2353 n/a } 2354 n/a else { 2355 n/a const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2356 n/a assert(kind == PyUnicode_4BYTE_KIND); 2357 n/a max_char = ucs4lib_find_max_char(u, u + len); 2358 n/a if (max_char >= 0x10000) 2359 n/a return; 2360 n/a } 2361 n/a copy = PyUnicode_New(len, max_char); 2362 n/a if (copy != NULL) 2363 n/a _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2364 n/a Py_DECREF(unicode); 2365 n/a *p_unicode = copy; 2366 n/a } 2367 n/a 2368 n/a PyObject* 2369 n/a _PyUnicode_Copy(PyObject *unicode) 2370 n/a { 2371 n/a Py_ssize_t length; 2372 n/a PyObject *copy; 2373 n/a 2374 n/a if (!PyUnicode_Check(unicode)) { 2375 n/a PyErr_BadInternalCall(); 2376 n/a return NULL; 2377 n/a } 2378 n/a if (PyUnicode_READY(unicode) == -1) 2379 n/a return NULL; 2380 n/a 2381 n/a length = PyUnicode_GET_LENGTH(unicode); 2382 n/a copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2383 n/a if (!copy) 2384 n/a return NULL; 2385 n/a assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2386 n/a 2387 n/a memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 
2388 n/a length * PyUnicode_KIND(unicode)); 2389 n/a assert(_PyUnicode_CheckConsistency(copy, 1)); 2390 n/a return copy; 2391 n/a } 2392 n/a 2393 n/a 2394 n/a /* Widen Unicode objects to larger buffers. Don't write terminating null 2395 n/a character. Return NULL on error. */ 2396 n/a 2397 n/a void* 2398 n/a _PyUnicode_AsKind(PyObject *s, unsigned int kind) 2399 n/a { 2400 n/a Py_ssize_t len; 2401 n/a void *result; 2402 n/a unsigned int skind; 2403 n/a 2404 n/a if (PyUnicode_READY(s) == -1) 2405 n/a return NULL; 2406 n/a 2407 n/a len = PyUnicode_GET_LENGTH(s); 2408 n/a skind = PyUnicode_KIND(s); 2409 n/a if (skind >= kind) { 2410 n/a PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2411 n/a return NULL; 2412 n/a } 2413 n/a switch (kind) { 2414 n/a case PyUnicode_2BYTE_KIND: 2415 n/a result = PyMem_New(Py_UCS2, len); 2416 n/a if (!result) 2417 n/a return PyErr_NoMemory(); 2418 n/a assert(skind == PyUnicode_1BYTE_KIND); 2419 n/a _PyUnicode_CONVERT_BYTES( 2420 n/a Py_UCS1, Py_UCS2, 2421 n/a PyUnicode_1BYTE_DATA(s), 2422 n/a PyUnicode_1BYTE_DATA(s) + len, 2423 n/a result); 2424 n/a return result; 2425 n/a case PyUnicode_4BYTE_KIND: 2426 n/a result = PyMem_New(Py_UCS4, len); 2427 n/a if (!result) 2428 n/a return PyErr_NoMemory(); 2429 n/a if (skind == PyUnicode_2BYTE_KIND) { 2430 n/a _PyUnicode_CONVERT_BYTES( 2431 n/a Py_UCS2, Py_UCS4, 2432 n/a PyUnicode_2BYTE_DATA(s), 2433 n/a PyUnicode_2BYTE_DATA(s) + len, 2434 n/a result); 2435 n/a } 2436 n/a else { 2437 n/a assert(skind == PyUnicode_1BYTE_KIND); 2438 n/a _PyUnicode_CONVERT_BYTES( 2439 n/a Py_UCS1, Py_UCS4, 2440 n/a PyUnicode_1BYTE_DATA(s), 2441 n/a PyUnicode_1BYTE_DATA(s) + len, 2442 n/a result); 2443 n/a } 2444 n/a return result; 2445 n/a default: 2446 n/a break; 2447 n/a } 2448 n/a PyErr_SetString(PyExc_SystemError, "invalid kind"); 2449 n/a return NULL; 2450 n/a } 2451 n/a 2452 n/a static Py_UCS4* 2453 n/a as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2454 n/a int copy_null) 
2455 n/a { 2456 n/a int kind; 2457 n/a void *data; 2458 n/a Py_ssize_t len, targetlen; 2459 n/a if (PyUnicode_READY(string) == -1) 2460 n/a return NULL; 2461 n/a kind = PyUnicode_KIND(string); 2462 n/a data = PyUnicode_DATA(string); 2463 n/a len = PyUnicode_GET_LENGTH(string); 2464 n/a targetlen = len; 2465 n/a if (copy_null) 2466 n/a targetlen++; 2467 n/a if (!target) { 2468 n/a target = PyMem_New(Py_UCS4, targetlen); 2469 n/a if (!target) { 2470 n/a PyErr_NoMemory(); 2471 n/a return NULL; 2472 n/a } 2473 n/a } 2474 n/a else { 2475 n/a if (targetsize < targetlen) { 2476 n/a PyErr_Format(PyExc_SystemError, 2477 n/a "string is longer than the buffer"); 2478 n/a if (copy_null && 0 < targetsize) 2479 n/a target[0] = 0; 2480 n/a return NULL; 2481 n/a } 2482 n/a } 2483 n/a if (kind == PyUnicode_1BYTE_KIND) { 2484 n/a Py_UCS1 *start = (Py_UCS1 *) data; 2485 n/a _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2486 n/a } 2487 n/a else if (kind == PyUnicode_2BYTE_KIND) { 2488 n/a Py_UCS2 *start = (Py_UCS2 *) data; 2489 n/a _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2490 n/a } 2491 n/a else { 2492 n/a assert(kind == PyUnicode_4BYTE_KIND); 2493 n/a memcpy(target, data, len * sizeof(Py_UCS4)); 2494 n/a } 2495 n/a if (copy_null) 2496 n/a target[len] = 0; 2497 n/a return target; 2498 n/a } 2499 n/a 2500 n/a Py_UCS4* 2501 n/a PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2502 n/a int copy_null) 2503 n/a { 2504 n/a if (target == NULL || targetsize < 0) { 2505 n/a PyErr_BadInternalCall(); 2506 n/a return NULL; 2507 n/a } 2508 n/a return as_ucs4(string, target, targetsize, copy_null); 2509 n/a } 2510 n/a 2511 n/a Py_UCS4* 2512 n/a PyUnicode_AsUCS4Copy(PyObject *string) 2513 n/a { 2514 n/a return as_ucs4(string, NULL, 0, 1); 2515 n/a } 2516 n/a 2517 n/a /* maximum number of characters required for output of %lld or %p. 
2518 n/a We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2519 n/a plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2520 n/a #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2521 n/a 2522 n/a static int 2523 n/a unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2524 n/a Py_ssize_t width, Py_ssize_t precision) 2525 n/a { 2526 n/a Py_ssize_t length, fill, arglen; 2527 n/a Py_UCS4 maxchar; 2528 n/a 2529 n/a if (PyUnicode_READY(str) == -1) 2530 n/a return -1; 2531 n/a 2532 n/a length = PyUnicode_GET_LENGTH(str); 2533 n/a if ((precision == -1 || precision >= length) 2534 n/a && width <= length) 2535 n/a return _PyUnicodeWriter_WriteStr(writer, str); 2536 n/a 2537 n/a if (precision != -1) 2538 n/a length = Py_MIN(precision, length); 2539 n/a 2540 n/a arglen = Py_MAX(length, width); 2541 n/a if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2542 n/a maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2543 n/a else 2544 n/a maxchar = writer->maxchar; 2545 n/a 2546 n/a if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2547 n/a return -1; 2548 n/a 2549 n/a if (width > length) { 2550 n/a fill = width - length; 2551 n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2552 n/a return -1; 2553 n/a writer->pos += fill; 2554 n/a } 2555 n/a 2556 n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2557 n/a str, 0, length); 2558 n/a writer->pos += length; 2559 n/a return 0; 2560 n/a } 2561 n/a 2562 n/a static int 2563 n/a unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2564 n/a Py_ssize_t width, Py_ssize_t precision) 2565 n/a { 2566 n/a /* UTF-8 */ 2567 n/a Py_ssize_t length; 2568 n/a PyObject *unicode; 2569 n/a int res; 2570 n/a 2571 n/a length = strlen(str); 2572 n/a if (precision != -1) 2573 n/a length = Py_MIN(length, precision); 2574 n/a unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2575 n/a if (unicode == NULL) 2576 n/a return -1; 
2577 n/a 2578 n/a res = unicode_fromformat_write_str(writer, unicode, width, -1); 2579 n/a Py_DECREF(unicode); 2580 n/a return res; 2581 n/a } 2582 n/a 2583 n/a static const char* 2584 n/a unicode_fromformat_arg(_PyUnicodeWriter *writer, 2585 n/a const char *f, va_list *vargs) 2586 n/a { 2587 n/a const char *p; 2588 n/a Py_ssize_t len; 2589 n/a int zeropad; 2590 n/a Py_ssize_t width; 2591 n/a Py_ssize_t precision; 2592 n/a int longflag; 2593 n/a int longlongflag; 2594 n/a int size_tflag; 2595 n/a Py_ssize_t fill; 2596 n/a 2597 n/a p = f; 2598 n/a f++; 2599 n/a zeropad = 0; 2600 n/a if (*f == '0') { 2601 n/a zeropad = 1; 2602 n/a f++; 2603 n/a } 2604 n/a 2605 n/a /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2606 n/a width = -1; 2607 n/a if (Py_ISDIGIT((unsigned)*f)) { 2608 n/a width = *f - '0'; 2609 n/a f++; 2610 n/a while (Py_ISDIGIT((unsigned)*f)) { 2611 n/a if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2612 n/a PyErr_SetString(PyExc_ValueError, 2613 n/a "width too big"); 2614 n/a return NULL; 2615 n/a } 2616 n/a width = (width * 10) + (*f - '0'); 2617 n/a f++; 2618 n/a } 2619 n/a } 2620 n/a precision = -1; 2621 n/a if (*f == '.') { 2622 n/a f++; 2623 n/a if (Py_ISDIGIT((unsigned)*f)) { 2624 n/a precision = (*f - '0'); 2625 n/a f++; 2626 n/a while (Py_ISDIGIT((unsigned)*f)) { 2627 n/a if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2628 n/a PyErr_SetString(PyExc_ValueError, 2629 n/a "precision too big"); 2630 n/a return NULL; 2631 n/a } 2632 n/a precision = (precision * 10) + (*f - '0'); 2633 n/a f++; 2634 n/a } 2635 n/a } 2636 n/a if (*f == '%') { 2637 n/a /* "%.3%s" => f points to "3" */ 2638 n/a f--; 2639 n/a } 2640 n/a } 2641 n/a if (*f == '\0') { 2642 n/a /* bogus format "%.123" => go backward, f points to "3" */ 2643 n/a f--; 2644 n/a } 2645 n/a 2646 n/a /* Handle %ld, %lu, %lld and %llu. 
*/ 2647 n/a longflag = 0; 2648 n/a longlongflag = 0; 2649 n/a size_tflag = 0; 2650 n/a if (*f == 'l') { 2651 n/a if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2652 n/a longflag = 1; 2653 n/a ++f; 2654 n/a } 2655 n/a else if (f[1] == 'l' && 2656 n/a (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2657 n/a longlongflag = 1; 2658 n/a f += 2; 2659 n/a } 2660 n/a } 2661 n/a /* handle the size_t flag. */ 2662 n/a else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2663 n/a size_tflag = 1; 2664 n/a ++f; 2665 n/a } 2666 n/a 2667 n/a if (f[1] == '\0') 2668 n/a writer->overallocate = 0; 2669 n/a 2670 n/a switch (*f) { 2671 n/a case 'c': 2672 n/a { 2673 n/a int ordinal = va_arg(*vargs, int); 2674 n/a if (ordinal < 0 || ordinal > MAX_UNICODE) { 2675 n/a PyErr_SetString(PyExc_OverflowError, 2676 n/a "character argument not in range(0x110000)"); 2677 n/a return NULL; 2678 n/a } 2679 n/a if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2680 n/a return NULL; 2681 n/a break; 2682 n/a } 2683 n/a 2684 n/a case 'i': 2685 n/a case 'd': 2686 n/a case 'u': 2687 n/a case 'x': 2688 n/a { 2689 n/a /* used by sprintf */ 2690 n/a char buffer[MAX_LONG_LONG_CHARS]; 2691 n/a Py_ssize_t arglen; 2692 n/a 2693 n/a if (*f == 'u') { 2694 n/a if (longflag) 2695 n/a len = sprintf(buffer, "%lu", 2696 n/a va_arg(*vargs, unsigned long)); 2697 n/a else if (longlongflag) 2698 n/a len = sprintf(buffer, "%llu", 2699 n/a va_arg(*vargs, unsigned long long)); 2700 n/a else if (size_tflag) 2701 n/a len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", 2702 n/a va_arg(*vargs, size_t)); 2703 n/a else 2704 n/a len = sprintf(buffer, "%u", 2705 n/a va_arg(*vargs, unsigned int)); 2706 n/a } 2707 n/a else if (*f == 'x') { 2708 n/a len = sprintf(buffer, "%x", va_arg(*vargs, int)); 2709 n/a } 2710 n/a else { 2711 n/a if (longflag) 2712 n/a len = sprintf(buffer, "%li", 2713 n/a va_arg(*vargs, long)); 2714 n/a else if (longlongflag) 2715 n/a len = sprintf(buffer, "%lli", 2716 n/a va_arg(*vargs, long 
long)); 2717 n/a else if (size_tflag) 2718 n/a len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i", 2719 n/a va_arg(*vargs, Py_ssize_t)); 2720 n/a else 2721 n/a len = sprintf(buffer, "%i", 2722 n/a va_arg(*vargs, int)); 2723 n/a } 2724 n/a assert(len >= 0); 2725 n/a 2726 n/a if (precision < len) 2727 n/a precision = len; 2728 n/a 2729 n/a arglen = Py_MAX(precision, width); 2730 n/a if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 2731 n/a return NULL; 2732 n/a 2733 n/a if (width > precision) { 2734 n/a Py_UCS4 fillchar; 2735 n/a fill = width - precision; 2736 n/a fillchar = zeropad?'0':' '; 2737 n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2738 n/a return NULL; 2739 n/a writer->pos += fill; 2740 n/a } 2741 n/a if (precision > len) { 2742 n/a fill = precision - len; 2743 n/a if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2744 n/a return NULL; 2745 n/a writer->pos += fill; 2746 n/a } 2747 n/a 2748 n/a if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) 2749 n/a return NULL; 2750 n/a break; 2751 n/a } 2752 n/a 2753 n/a case 'p': 2754 n/a { 2755 n/a char number[MAX_LONG_LONG_CHARS]; 2756 n/a 2757 n/a len = sprintf(number, "%p", va_arg(*vargs, void*)); 2758 n/a assert(len >= 0); 2759 n/a 2760 n/a /* %p is ill-defined: ensure leading 0x. 
*/ 2761 n/a if (number[1] == 'X') 2762 n/a number[1] = 'x'; 2763 n/a else if (number[1] != 'x') { 2764 n/a memmove(number + 2, number, 2765 n/a strlen(number) + 1); 2766 n/a number[0] = '0'; 2767 n/a number[1] = 'x'; 2768 n/a len += 2; 2769 n/a } 2770 n/a 2771 n/a if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) 2772 n/a return NULL; 2773 n/a break; 2774 n/a } 2775 n/a 2776 n/a case 's': 2777 n/a { 2778 n/a /* UTF-8 */ 2779 n/a const char *s = va_arg(*vargs, const char*); 2780 n/a if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2781 n/a return NULL; 2782 n/a break; 2783 n/a } 2784 n/a 2785 n/a case 'U': 2786 n/a { 2787 n/a PyObject *obj = va_arg(*vargs, PyObject *); 2788 n/a assert(obj && _PyUnicode_CHECK(obj)); 2789 n/a 2790 n/a if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2791 n/a return NULL; 2792 n/a break; 2793 n/a } 2794 n/a 2795 n/a case 'V': 2796 n/a { 2797 n/a PyObject *obj = va_arg(*vargs, PyObject *); 2798 n/a const char *str = va_arg(*vargs, const char *); 2799 n/a if (obj) { 2800 n/a assert(_PyUnicode_CHECK(obj)); 2801 n/a if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2802 n/a return NULL; 2803 n/a } 2804 n/a else { 2805 n/a assert(str != NULL); 2806 n/a if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 2807 n/a return NULL; 2808 n/a } 2809 n/a break; 2810 n/a } 2811 n/a 2812 n/a case 'S': 2813 n/a { 2814 n/a PyObject *obj = va_arg(*vargs, PyObject *); 2815 n/a PyObject *str; 2816 n/a assert(obj); 2817 n/a str = PyObject_Str(obj); 2818 n/a if (!str) 2819 n/a return NULL; 2820 n/a if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2821 n/a Py_DECREF(str); 2822 n/a return NULL; 2823 n/a } 2824 n/a Py_DECREF(str); 2825 n/a break; 2826 n/a } 2827 n/a 2828 n/a case 'R': 2829 n/a { 2830 n/a PyObject *obj = va_arg(*vargs, PyObject *); 2831 n/a PyObject *repr; 2832 n/a assert(obj); 2833 n/a repr = PyObject_Repr(obj); 2834 n/a if 
(!repr) 2835 n/a return NULL; 2836 n/a if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2837 n/a Py_DECREF(repr); 2838 n/a return NULL; 2839 n/a } 2840 n/a Py_DECREF(repr); 2841 n/a break; 2842 n/a } 2843 n/a 2844 n/a case 'A': 2845 n/a { 2846 n/a PyObject *obj = va_arg(*vargs, PyObject *); 2847 n/a PyObject *ascii; 2848 n/a assert(obj); 2849 n/a ascii = PyObject_ASCII(obj); 2850 n/a if (!ascii) 2851 n/a return NULL; 2852 n/a if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2853 n/a Py_DECREF(ascii); 2854 n/a return NULL; 2855 n/a } 2856 n/a Py_DECREF(ascii); 2857 n/a break; 2858 n/a } 2859 n/a 2860 n/a case '%': 2861 n/a if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2862 n/a return NULL; 2863 n/a break; 2864 n/a 2865 n/a default: 2866 n/a /* if we stumble upon an unknown formatting code, copy the rest 2867 n/a of the format string to the output string. (we cannot just 2868 n/a skip the code, since there's no way to know what's in the 2869 n/a argument list) */ 2870 n/a len = strlen(p); 2871 n/a if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 2872 n/a return NULL; 2873 n/a f = p+len; 2874 n/a return f; 2875 n/a } 2876 n/a 2877 n/a f++; 2878 n/a return f; 2879 n/a } 2880 n/a 2881 n/a PyObject * 2882 n/a PyUnicode_FromFormatV(const char *format, va_list vargs) 2883 n/a { 2884 n/a va_list vargs2; 2885 n/a const char *f; 2886 n/a _PyUnicodeWriter writer; 2887 n/a 2888 n/a _PyUnicodeWriter_Init(&writer); 2889 n/a writer.min_length = strlen(format) + 100; 2890 n/a writer.overallocate = 1; 2891 n/a 2892 n/a // Copy varags to be able to pass a reference to a subfunction. 
2893 n/a va_copy(vargs2, vargs); 2894 n/a 2895 n/a for (f = format; *f; ) { 2896 n/a if (*f == '%') { 2897 n/a f = unicode_fromformat_arg(&writer, f, &vargs2); 2898 n/a if (f == NULL) 2899 n/a goto fail; 2900 n/a } 2901 n/a else { 2902 n/a const char *p; 2903 n/a Py_ssize_t len; 2904 n/a 2905 n/a p = f; 2906 n/a do 2907 n/a { 2908 n/a if ((unsigned char)*p > 127) { 2909 n/a PyErr_Format(PyExc_ValueError, 2910 n/a "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2911 n/a "string, got a non-ASCII byte: 0x%02x", 2912 n/a (unsigned char)*p); 2913 n/a goto fail; 2914 n/a } 2915 n/a p++; 2916 n/a } 2917 n/a while (*p != '\0' && *p != '%'); 2918 n/a len = p - f; 2919 n/a 2920 n/a if (*p == '\0') 2921 n/a writer.overallocate = 0; 2922 n/a 2923 n/a if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2924 n/a goto fail; 2925 n/a 2926 n/a f = p; 2927 n/a } 2928 n/a } 2929 n/a va_end(vargs2); 2930 n/a return _PyUnicodeWriter_Finish(&writer); 2931 n/a 2932 n/a fail: 2933 n/a va_end(vargs2); 2934 n/a _PyUnicodeWriter_Dealloc(&writer); 2935 n/a return NULL; 2936 n/a } 2937 n/a 2938 n/a PyObject * 2939 n/a PyUnicode_FromFormat(const char *format, ...) 2940 n/a { 2941 n/a PyObject* ret; 2942 n/a va_list vargs; 2943 n/a 2944 n/a #ifdef HAVE_STDARG_PROTOTYPES 2945 n/a va_start(vargs, format); 2946 n/a #else 2947 n/a va_start(vargs); 2948 n/a #endif 2949 n/a ret = PyUnicode_FromFormatV(format, vargs); 2950 n/a va_end(vargs); 2951 n/a return ret; 2952 n/a } 2953 n/a 2954 n/a #ifdef HAVE_WCHAR_H 2955 n/a 2956 n/a /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2957 n/a convert a Unicode object to a wide character string. 2958 n/a 2959 n/a - If w is NULL: return the number of wide characters (including the null 2960 n/a character) required to convert the unicode object. Ignore size argument. 2961 n/a 2962 n/a - Otherwise: return the number of wide characters (excluding the null 2963 n/a character) written into w. 
Write at most size wide characters (including 2964 n/a the null character). */ 2965 n/a static Py_ssize_t 2966 n/a unicode_aswidechar(PyObject *unicode, 2967 n/a wchar_t *w, 2968 n/a Py_ssize_t size) 2969 n/a { 2970 n/a Py_ssize_t res; 2971 n/a const wchar_t *wstr; 2972 n/a 2973 n/a wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2974 n/a if (wstr == NULL) 2975 n/a return -1; 2976 n/a 2977 n/a if (w != NULL) { 2978 n/a if (size > res) 2979 n/a size = res + 1; 2980 n/a else 2981 n/a res = size; 2982 n/a memcpy(w, wstr, size * sizeof(wchar_t)); 2983 n/a return res; 2984 n/a } 2985 n/a else 2986 n/a return res + 1; 2987 n/a } 2988 n/a 2989 n/a Py_ssize_t 2990 n/a PyUnicode_AsWideChar(PyObject *unicode, 2991 n/a wchar_t *w, 2992 n/a Py_ssize_t size) 2993 n/a { 2994 n/a if (unicode == NULL) { 2995 n/a PyErr_BadInternalCall(); 2996 n/a return -1; 2997 n/a } 2998 n/a return unicode_aswidechar(unicode, w, size); 2999 n/a } 3000 n/a 3001 n/a wchar_t* 3002 n/a PyUnicode_AsWideCharString(PyObject *unicode, 3003 n/a Py_ssize_t *size) 3004 n/a { 3005 n/a wchar_t* buffer; 3006 n/a Py_ssize_t buflen; 3007 n/a 3008 n/a if (unicode == NULL) { 3009 n/a PyErr_BadInternalCall(); 3010 n/a return NULL; 3011 n/a } 3012 n/a 3013 n/a buflen = unicode_aswidechar(unicode, NULL, 0); 3014 n/a if (buflen == -1) 3015 n/a return NULL; 3016 n/a buffer = PyMem_NEW(wchar_t, buflen); 3017 n/a if (buffer == NULL) { 3018 n/a PyErr_NoMemory(); 3019 n/a return NULL; 3020 n/a } 3021 n/a buflen = unicode_aswidechar(unicode, buffer, buflen); 3022 n/a if (buflen == -1) { 3023 n/a PyMem_FREE(buffer); 3024 n/a return NULL; 3025 n/a } 3026 n/a if (size != NULL) 3027 n/a *size = buflen; 3028 n/a return buffer; 3029 n/a } 3030 n/a 3031 n/a #endif /* HAVE_WCHAR_H */ 3032 n/a 3033 n/a PyObject * 3034 n/a PyUnicode_FromOrdinal(int ordinal) 3035 n/a { 3036 n/a if (ordinal < 0 || ordinal > MAX_UNICODE) { 3037 n/a PyErr_SetString(PyExc_ValueError, 3038 n/a "chr() arg not in range(0x110000)"); 3039 n/a return NULL; 
3040 n/a } 3041 n/a 3042 n/a return unicode_char((Py_UCS4)ordinal); 3043 n/a } 3044 n/a 3045 n/a PyObject * 3046 n/a PyUnicode_FromObject(PyObject *obj) 3047 n/a { 3048 n/a /* XXX Perhaps we should make this API an alias of 3049 n/a PyObject_Str() instead ?! */ 3050 n/a if (PyUnicode_CheckExact(obj)) { 3051 n/a if (PyUnicode_READY(obj) == -1) 3052 n/a return NULL; 3053 n/a Py_INCREF(obj); 3054 n/a return obj; 3055 n/a } 3056 n/a if (PyUnicode_Check(obj)) { 3057 n/a /* For a Unicode subtype that's not a Unicode object, 3058 n/a return a true Unicode object with the same data. */ 3059 n/a return _PyUnicode_Copy(obj); 3060 n/a } 3061 n/a PyErr_Format(PyExc_TypeError, 3062 n/a "Can't convert '%.100s' object to str implicitly", 3063 n/a Py_TYPE(obj)->tp_name); 3064 n/a return NULL; 3065 n/a } 3066 n/a 3067 n/a PyObject * 3068 n/a PyUnicode_FromEncodedObject(PyObject *obj, 3069 n/a const char *encoding, 3070 n/a const char *errors) 3071 n/a { 3072 n/a Py_buffer buffer; 3073 n/a PyObject *v; 3074 n/a 3075 n/a if (obj == NULL) { 3076 n/a PyErr_BadInternalCall(); 3077 n/a return NULL; 3078 n/a } 3079 n/a 3080 n/a /* Decoding bytes objects is the most common case and should be fast */ 3081 n/a if (PyBytes_Check(obj)) { 3082 n/a if (PyBytes_GET_SIZE(obj) == 0) 3083 n/a _Py_RETURN_UNICODE_EMPTY(); 3084 n/a v = PyUnicode_Decode( 3085 n/a PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3086 n/a encoding, errors); 3087 n/a return v; 3088 n/a } 3089 n/a 3090 n/a if (PyUnicode_Check(obj)) { 3091 n/a PyErr_SetString(PyExc_TypeError, 3092 n/a "decoding str is not supported"); 3093 n/a return NULL; 3094 n/a } 3095 n/a 3096 n/a /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3097 n/a if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3098 n/a PyErr_Format(PyExc_TypeError, 3099 n/a "decoding to str: need a bytes-like object, %.80s found", 3100 n/a Py_TYPE(obj)->tp_name); 3101 n/a return NULL; 3102 n/a } 3103 n/a 3104 n/a if (buffer.len == 0) { 3105 n/a 
PyBuffer_Release(&buffer); 3106 n/a _Py_RETURN_UNICODE_EMPTY(); 3107 n/a } 3108 n/a 3109 n/a v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3110 n/a PyBuffer_Release(&buffer); 3111 n/a return v; 3112 n/a } 3113 n/a 3114 n/a /* Normalize an encoding name: similar to encodings.normalize_encoding(), but 3115 n/a also convert to lowercase. Return 1 on success, or 0 on error (encoding is 3116 n/a longer than lower_len-1). */ 3117 n/a int 3118 n/a _Py_normalize_encoding(const char *encoding, 3119 n/a char *lower, 3120 n/a size_t lower_len) 3121 n/a { 3122 n/a const char *e; 3123 n/a char *l; 3124 n/a char *l_end; 3125 n/a int punct; 3126 n/a 3127 n/a assert(encoding != NULL); 3128 n/a 3129 n/a e = encoding; 3130 n/a l = lower; 3131 n/a l_end = &lower[lower_len - 1]; 3132 n/a punct = 0; 3133 n/a while (1) { 3134 n/a char c = *e; 3135 n/a if (c == 0) { 3136 n/a break; 3137 n/a } 3138 n/a 3139 n/a if (Py_ISALNUM(c) || c == '.') { 3140 n/a if (punct && l != lower) { 3141 n/a if (l == l_end) { 3142 n/a return 0; 3143 n/a } 3144 n/a *l++ = '_'; 3145 n/a } 3146 n/a punct = 0; 3147 n/a 3148 n/a if (l == l_end) { 3149 n/a return 0; 3150 n/a } 3151 n/a *l++ = Py_TOLOWER(c); 3152 n/a } 3153 n/a else { 3154 n/a punct = 1; 3155 n/a } 3156 n/a 3157 n/a e++; 3158 n/a } 3159 n/a *l = '\0'; 3160 n/a return 1; 3161 n/a } 3162 n/a 3163 n/a PyObject * 3164 n/a PyUnicode_Decode(const char *s, 3165 n/a Py_ssize_t size, 3166 n/a const char *encoding, 3167 n/a const char *errors) 3168 n/a { 3169 n/a PyObject *buffer = NULL, *unicode; 3170 n/a Py_buffer info; 3171 n/a char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */ 3172 n/a 3173 n/a if (encoding == NULL) { 3174 n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3175 n/a } 3176 n/a 3177 n/a /* Shortcuts for common default encodings */ 3178 n/a if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3179 n/a char *lower = buflower; 3180 n/a 3181 n/a /* Fast paths */ 3182 n/a 
if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3183 n/a lower += 3; 3184 n/a if (*lower == '_') { 3185 n/a /* Match "utf8" and "utf_8" */ 3186 n/a lower++; 3187 n/a } 3188 n/a 3189 n/a if (lower[0] == '8' && lower[1] == 0) { 3190 n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3191 n/a } 3192 n/a else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3193 n/a return PyUnicode_DecodeUTF16(s, size, errors, 0); 3194 n/a } 3195 n/a else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3196 n/a return PyUnicode_DecodeUTF32(s, size, errors, 0); 3197 n/a } 3198 n/a } 3199 n/a else { 3200 n/a if (strcmp(lower, "ascii") == 0 3201 n/a || strcmp(lower, "us_ascii") == 0) { 3202 n/a return PyUnicode_DecodeASCII(s, size, errors); 3203 n/a } 3204 n/a #ifdef MS_WINDOWS 3205 n/a else if (strcmp(lower, "mbcs") == 0) { 3206 n/a return PyUnicode_DecodeMBCS(s, size, errors); 3207 n/a } 3208 n/a #endif 3209 n/a else if (strcmp(lower, "latin1") == 0 3210 n/a || strcmp(lower, "latin_1") == 0 3211 n/a || strcmp(lower, "iso_8859_1") == 0 3212 n/a || strcmp(lower, "iso8859_1") == 0) { 3213 n/a return PyUnicode_DecodeLatin1(s, size, errors); 3214 n/a } 3215 n/a } 3216 n/a } 3217 n/a 3218 n/a /* Decode via the codec registry */ 3219 n/a buffer = NULL; 3220 n/a if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3221 n/a goto onError; 3222 n/a buffer = PyMemoryView_FromBuffer(&info); 3223 n/a if (buffer == NULL) 3224 n/a goto onError; 3225 n/a unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3226 n/a if (unicode == NULL) 3227 n/a goto onError; 3228 n/a if (!PyUnicode_Check(unicode)) { 3229 n/a PyErr_Format(PyExc_TypeError, 3230 n/a "'%.400s' decoder returned '%.400s' instead of 'str'; " 3231 n/a "use codecs.decode() to decode to arbitrary types", 3232 n/a encoding, 3233 n/a Py_TYPE(unicode)->tp_name); 3234 n/a Py_DECREF(unicode); 3235 n/a goto onError; 3236 n/a } 3237 n/a Py_DECREF(buffer); 3238 n/a return 
unicode_result(unicode); 3239 n/a 3240 n/a onError: 3241 n/a Py_XDECREF(buffer); 3242 n/a return NULL; 3243 n/a } 3244 n/a 3245 n/a PyObject * 3246 n/a PyUnicode_AsDecodedObject(PyObject *unicode, 3247 n/a const char *encoding, 3248 n/a const char *errors) 3249 n/a { 3250 n/a if (!PyUnicode_Check(unicode)) { 3251 n/a PyErr_BadArgument(); 3252 n/a return NULL; 3253 n/a } 3254 n/a 3255 n/a if (PyErr_WarnEx(PyExc_DeprecationWarning, 3256 n/a "PyUnicode_AsDecodedObject() is deprecated; " 3257 n/a "use PyCodec_Decode() to decode from str", 1) < 0) 3258 n/a return NULL; 3259 n/a 3260 n/a if (encoding == NULL) 3261 n/a encoding = PyUnicode_GetDefaultEncoding(); 3262 n/a 3263 n/a /* Decode via the codec registry */ 3264 n/a return PyCodec_Decode(unicode, encoding, errors); 3265 n/a } 3266 n/a 3267 n/a PyObject * 3268 n/a PyUnicode_AsDecodedUnicode(PyObject *unicode, 3269 n/a const char *encoding, 3270 n/a const char *errors) 3271 n/a { 3272 n/a PyObject *v; 3273 n/a 3274 n/a if (!PyUnicode_Check(unicode)) { 3275 n/a PyErr_BadArgument(); 3276 n/a goto onError; 3277 n/a } 3278 n/a 3279 n/a if (PyErr_WarnEx(PyExc_DeprecationWarning, 3280 n/a "PyUnicode_AsDecodedUnicode() is deprecated; " 3281 n/a "use PyCodec_Decode() to decode from str to str", 1) < 0) 3282 n/a return NULL; 3283 n/a 3284 n/a if (encoding == NULL) 3285 n/a encoding = PyUnicode_GetDefaultEncoding(); 3286 n/a 3287 n/a /* Decode via the codec registry */ 3288 n/a v = PyCodec_Decode(unicode, encoding, errors); 3289 n/a if (v == NULL) 3290 n/a goto onError; 3291 n/a if (!PyUnicode_Check(v)) { 3292 n/a PyErr_Format(PyExc_TypeError, 3293 n/a "'%.400s' decoder returned '%.400s' instead of 'str'; " 3294 n/a "use codecs.decode() to decode to arbitrary types", 3295 n/a encoding, 3296 n/a Py_TYPE(unicode)->tp_name); 3297 n/a Py_DECREF(v); 3298 n/a goto onError; 3299 n/a } 3300 n/a return unicode_result(v); 3301 n/a 3302 n/a onError: 3303 n/a return NULL; 3304 n/a } 3305 n/a 3306 n/a PyObject * 3307 n/a 
PyUnicode_Encode(const Py_UNICODE *s, 3308 n/a Py_ssize_t size, 3309 n/a const char *encoding, 3310 n/a const char *errors) 3311 n/a { 3312 n/a PyObject *v, *unicode; 3313 n/a 3314 n/a unicode = PyUnicode_FromWideChar(s, size); 3315 n/a if (unicode == NULL) 3316 n/a return NULL; 3317 n/a v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3318 n/a Py_DECREF(unicode); 3319 n/a return v; 3320 n/a } 3321 n/a 3322 n/a PyObject * 3323 n/a PyUnicode_AsEncodedObject(PyObject *unicode, 3324 n/a const char *encoding, 3325 n/a const char *errors) 3326 n/a { 3327 n/a PyObject *v; 3328 n/a 3329 n/a if (!PyUnicode_Check(unicode)) { 3330 n/a PyErr_BadArgument(); 3331 n/a goto onError; 3332 n/a } 3333 n/a 3334 n/a if (PyErr_WarnEx(PyExc_DeprecationWarning, 3335 n/a "PyUnicode_AsEncodedObject() is deprecated; " 3336 n/a "use PyUnicode_AsEncodedString() to encode from str to bytes " 3337 n/a "or PyCodec_Encode() for generic encoding", 1) < 0) 3338 n/a return NULL; 3339 n/a 3340 n/a if (encoding == NULL) 3341 n/a encoding = PyUnicode_GetDefaultEncoding(); 3342 n/a 3343 n/a /* Encode via the codec registry */ 3344 n/a v = PyCodec_Encode(unicode, encoding, errors); 3345 n/a if (v == NULL) 3346 n/a goto onError; 3347 n/a return v; 3348 n/a 3349 n/a onError: 3350 n/a return NULL; 3351 n/a } 3352 n/a 3353 n/a static size_t 3354 n/a wcstombs_errorpos(const wchar_t *wstr) 3355 n/a { 3356 n/a size_t len; 3357 n/a #if SIZEOF_WCHAR_T == 2 3358 n/a wchar_t buf[3]; 3359 n/a #else 3360 n/a wchar_t buf[2]; 3361 n/a #endif 3362 n/a char outbuf[MB_LEN_MAX]; 3363 n/a const wchar_t *start, *previous; 3364 n/a 3365 n/a #if SIZEOF_WCHAR_T == 2 3366 n/a buf[2] = 0; 3367 n/a #else 3368 n/a buf[1] = 0; 3369 n/a #endif 3370 n/a start = wstr; 3371 n/a while (*wstr != L'\0') 3372 n/a { 3373 n/a previous = wstr; 3374 n/a #if SIZEOF_WCHAR_T == 2 3375 n/a if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3376 n/a && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3377 n/a { 3378 n/a buf[0] = wstr[0]; 3379 n/a buf[1] = 
wstr[1]; 3380 n/a wstr += 2; 3381 n/a } 3382 n/a else { 3383 n/a buf[0] = *wstr; 3384 n/a buf[1] = 0; 3385 n/a wstr++; 3386 n/a } 3387 n/a #else 3388 n/a buf[0] = *wstr; 3389 n/a wstr++; 3390 n/a #endif 3391 n/a len = wcstombs(outbuf, buf, sizeof(outbuf)); 3392 n/a if (len == (size_t)-1) 3393 n/a return previous - start; 3394 n/a } 3395 n/a 3396 n/a /* failed to find the unencodable character */ 3397 n/a return 0; 3398 n/a } 3399 n/a 3400 n/a static int 3401 n/a locale_error_handler(const char *errors, int *surrogateescape) 3402 n/a { 3403 n/a _Py_error_handler error_handler = get_error_handler(errors); 3404 n/a switch (error_handler) 3405 n/a { 3406 n/a case _Py_ERROR_STRICT: 3407 n/a *surrogateescape = 0; 3408 n/a return 0; 3409 n/a case _Py_ERROR_SURROGATEESCAPE: 3410 n/a *surrogateescape = 1; 3411 n/a return 0; 3412 n/a default: 3413 n/a PyErr_Format(PyExc_ValueError, 3414 n/a "only 'strict' and 'surrogateescape' error handlers " 3415 n/a "are supported, not '%s'", 3416 n/a errors); 3417 n/a return -1; 3418 n/a } 3419 n/a } 3420 n/a 3421 n/a PyObject * 3422 n/a PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3423 n/a { 3424 n/a Py_ssize_t wlen, wlen2; 3425 n/a wchar_t *wstr; 3426 n/a char *errmsg; 3427 n/a PyObject *bytes, *reason, *exc; 3428 n/a size_t error_pos, errlen; 3429 n/a int surrogateescape; 3430 n/a 3431 n/a if (locale_error_handler(errors, &surrogateescape) < 0) 3432 n/a return NULL; 3433 n/a 3434 n/a wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3435 n/a if (wstr == NULL) 3436 n/a return NULL; 3437 n/a 3438 n/a wlen2 = wcslen(wstr); 3439 n/a if (wlen2 != wlen) { 3440 n/a PyMem_Free(wstr); 3441 n/a PyErr_SetString(PyExc_ValueError, "embedded null character"); 3442 n/a return NULL; 3443 n/a } 3444 n/a 3445 n/a if (surrogateescape) { 3446 n/a /* "surrogateescape" error handler */ 3447 n/a char *str; 3448 n/a 3449 n/a str = Py_EncodeLocale(wstr, &error_pos); 3450 n/a if (str == NULL) { 3451 n/a if (error_pos == (size_t)-1) { 3452 n/a 
PyErr_NoMemory(); 3453 n/a PyMem_Free(wstr); 3454 n/a return NULL; 3455 n/a } 3456 n/a else { 3457 n/a goto encode_error; 3458 n/a } 3459 n/a } 3460 n/a PyMem_Free(wstr); 3461 n/a 3462 n/a bytes = PyBytes_FromString(str); 3463 n/a PyMem_Free(str); 3464 n/a } 3465 n/a else { 3466 n/a /* strict mode */ 3467 n/a size_t len, len2; 3468 n/a 3469 n/a len = wcstombs(NULL, wstr, 0); 3470 n/a if (len == (size_t)-1) { 3471 n/a error_pos = (size_t)-1; 3472 n/a goto encode_error; 3473 n/a } 3474 n/a 3475 n/a bytes = PyBytes_FromStringAndSize(NULL, len); 3476 n/a if (bytes == NULL) { 3477 n/a PyMem_Free(wstr); 3478 n/a return NULL; 3479 n/a } 3480 n/a 3481 n/a len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3482 n/a if (len2 == (size_t)-1 || len2 > len) { 3483 n/a Py_DECREF(bytes); 3484 n/a error_pos = (size_t)-1; 3485 n/a goto encode_error; 3486 n/a } 3487 n/a PyMem_Free(wstr); 3488 n/a } 3489 n/a return bytes; 3490 n/a 3491 n/a encode_error: 3492 n/a errmsg = strerror(errno); 3493 n/a assert(errmsg != NULL); 3494 n/a 3495 n/a if (error_pos == (size_t)-1) 3496 n/a error_pos = wcstombs_errorpos(wstr); 3497 n/a 3498 n/a PyMem_Free(wstr); 3499 n/a 3500 n/a wstr = Py_DecodeLocale(errmsg, &errlen); 3501 n/a if (wstr != NULL) { 3502 n/a reason = PyUnicode_FromWideChar(wstr, errlen); 3503 n/a PyMem_RawFree(wstr); 3504 n/a } else { 3505 n/a errmsg = NULL; 3506 n/a } 3507 n/a 3508 n/a if (errmsg == NULL) 3509 n/a reason = PyUnicode_FromString( 3510 n/a "wcstombs() encountered an unencodable " 3511 n/a "wide character"); 3512 n/a if (reason == NULL) 3513 n/a return NULL; 3514 n/a 3515 n/a exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3516 n/a "locale", unicode, 3517 n/a (Py_ssize_t)error_pos, 3518 n/a (Py_ssize_t)(error_pos+1), 3519 n/a reason); 3520 n/a Py_DECREF(reason); 3521 n/a if (exc != NULL) { 3522 n/a PyCodec_StrictErrors(exc); 3523 n/a Py_DECREF(exc); 3524 n/a } 3525 n/a return NULL; 3526 n/a } 3527 n/a 3528 n/a PyObject * 3529 n/a 
PyUnicode_EncodeFSDefault(PyObject *unicode) 3530 n/a { 3531 n/a #if defined(__APPLE__) 3532 n/a return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors); 3533 n/a #else 3534 n/a PyInterpreterState *interp = PyThreadState_GET()->interp; 3535 n/a /* Bootstrap check: if the filesystem codec is implemented in Python, we 3536 n/a cannot use it to encode and decode filenames before it is loaded. Load 3537 n/a the Python codec requires to encode at least its own filename. Use the C 3538 n/a version of the locale codec until the codec registry is initialized and 3539 n/a the Python codec is loaded. 3540 n/a 3541 n/a Py_FileSystemDefaultEncoding is shared between all interpreters, we 3542 n/a cannot only rely on it: check also interp->fscodec_initialized for 3543 n/a subinterpreters. */ 3544 n/a if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3545 n/a return PyUnicode_AsEncodedString(unicode, 3546 n/a Py_FileSystemDefaultEncoding, 3547 n/a Py_FileSystemDefaultEncodeErrors); 3548 n/a } 3549 n/a else { 3550 n/a return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors); 3551 n/a } 3552 n/a #endif 3553 n/a } 3554 n/a 3555 n/a PyObject * 3556 n/a PyUnicode_AsEncodedString(PyObject *unicode, 3557 n/a const char *encoding, 3558 n/a const char *errors) 3559 n/a { 3560 n/a PyObject *v; 3561 n/a char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */ 3562 n/a 3563 n/a if (!PyUnicode_Check(unicode)) { 3564 n/a PyErr_BadArgument(); 3565 n/a return NULL; 3566 n/a } 3567 n/a 3568 n/a if (encoding == NULL) { 3569 n/a return _PyUnicode_AsUTF8String(unicode, errors); 3570 n/a } 3571 n/a 3572 n/a /* Shortcuts for common default encodings */ 3573 n/a if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3574 n/a char *lower = buflower; 3575 n/a 3576 n/a /* Fast paths */ 3577 n/a if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3578 n/a lower += 3; 3579 n/a if (*lower == '_') { 3580 n/a /* Match 
"utf8" and "utf_8" */ 3581 n/a lower++; 3582 n/a } 3583 n/a 3584 n/a if (lower[0] == '8' && lower[1] == 0) { 3585 n/a return _PyUnicode_AsUTF8String(unicode, errors); 3586 n/a } 3587 n/a else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3588 n/a return _PyUnicode_EncodeUTF16(unicode, errors, 0); 3589 n/a } 3590 n/a else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3591 n/a return _PyUnicode_EncodeUTF32(unicode, errors, 0); 3592 n/a } 3593 n/a } 3594 n/a else { 3595 n/a if (strcmp(lower, "ascii") == 0 3596 n/a || strcmp(lower, "us_ascii") == 0) { 3597 n/a return _PyUnicode_AsASCIIString(unicode, errors); 3598 n/a } 3599 n/a #ifdef MS_WINDOWS 3600 n/a else if (strcmp(lower, "mbcs") == 0) { 3601 n/a return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3602 n/a } 3603 n/a #endif 3604 n/a else if (strcmp(lower, "latin1") == 0 || 3605 n/a strcmp(lower, "latin_1") == 0 || 3606 n/a strcmp(lower, "iso_8859_1") == 0 || 3607 n/a strcmp(lower, "iso8859_1") == 0) { 3608 n/a return _PyUnicode_AsLatin1String(unicode, errors); 3609 n/a } 3610 n/a } 3611 n/a } 3612 n/a 3613 n/a /* Encode via the codec registry */ 3614 n/a v = _PyCodec_EncodeText(unicode, encoding, errors); 3615 n/a if (v == NULL) 3616 n/a return NULL; 3617 n/a 3618 n/a /* The normal path */ 3619 n/a if (PyBytes_Check(v)) 3620 n/a return v; 3621 n/a 3622 n/a /* If the codec returns a buffer, raise a warning and convert to bytes */ 3623 n/a if (PyByteArray_Check(v)) { 3624 n/a int error; 3625 n/a PyObject *b; 3626 n/a 3627 n/a error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3628 n/a "encoder %s returned bytearray instead of bytes; " 3629 n/a "use codecs.encode() to encode to arbitrary types", 3630 n/a encoding); 3631 n/a if (error) { 3632 n/a Py_DECREF(v); 3633 n/a return NULL; 3634 n/a } 3635 n/a 3636 n/a b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3637 n/a Py_DECREF(v); 3638 n/a return b; 3639 n/a } 3640 n/a 3641 n/a PyErr_Format(PyExc_TypeError, 3642 n/a 
"'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3643 n/a "use codecs.encode() to encode to arbitrary types", 3644 n/a encoding, 3645 n/a Py_TYPE(v)->tp_name); 3646 n/a Py_DECREF(v); 3647 n/a return NULL; 3648 n/a } 3649 n/a 3650 n/a PyObject * 3651 n/a PyUnicode_AsEncodedUnicode(PyObject *unicode, 3652 n/a const char *encoding, 3653 n/a const char *errors) 3654 n/a { 3655 n/a PyObject *v; 3656 n/a 3657 n/a if (!PyUnicode_Check(unicode)) { 3658 n/a PyErr_BadArgument(); 3659 n/a goto onError; 3660 n/a } 3661 n/a 3662 n/a if (PyErr_WarnEx(PyExc_DeprecationWarning, 3663 n/a "PyUnicode_AsEncodedUnicode() is deprecated; " 3664 n/a "use PyCodec_Encode() to encode from str to str", 1) < 0) 3665 n/a return NULL; 3666 n/a 3667 n/a if (encoding == NULL) 3668 n/a encoding = PyUnicode_GetDefaultEncoding(); 3669 n/a 3670 n/a /* Encode via the codec registry */ 3671 n/a v = PyCodec_Encode(unicode, encoding, errors); 3672 n/a if (v == NULL) 3673 n/a goto onError; 3674 n/a if (!PyUnicode_Check(v)) { 3675 n/a PyErr_Format(PyExc_TypeError, 3676 n/a "'%.400s' encoder returned '%.400s' instead of 'str'; " 3677 n/a "use codecs.encode() to encode to arbitrary types", 3678 n/a encoding, 3679 n/a Py_TYPE(v)->tp_name); 3680 n/a Py_DECREF(v); 3681 n/a goto onError; 3682 n/a } 3683 n/a return v; 3684 n/a 3685 n/a onError: 3686 n/a return NULL; 3687 n/a } 3688 n/a 3689 n/a static size_t 3690 n/a mbstowcs_errorpos(const char *str, size_t len) 3691 n/a { 3692 n/a #ifdef HAVE_MBRTOWC 3693 n/a const char *start = str; 3694 n/a mbstate_t mbs; 3695 n/a size_t converted; 3696 n/a wchar_t ch; 3697 n/a 3698 n/a memset(&mbs, 0, sizeof mbs); 3699 n/a while (len) 3700 n/a { 3701 n/a converted = mbrtowc(&ch, str, len, &mbs); 3702 n/a if (converted == 0) 3703 n/a /* Reached end of string */ 3704 n/a break; 3705 n/a if (converted == (size_t)-1 || converted == (size_t)-2) { 3706 n/a /* Conversion error or incomplete character */ 3707 n/a return str - start; 3708 n/a } 3709 n/a else { 3710 n/a str += 
converted; 3711 n/a len -= converted; 3712 n/a } 3713 n/a } 3714 n/a /* failed to find the undecodable byte sequence */ 3715 n/a return 0; 3716 n/a #endif 3717 n/a return 0; 3718 n/a } 3719 n/a 3720 n/a PyObject* 3721 n/a PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3722 n/a const char *errors) 3723 n/a { 3724 n/a wchar_t smallbuf[256]; 3725 n/a size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3726 n/a wchar_t *wstr; 3727 n/a size_t wlen, wlen2; 3728 n/a PyObject *unicode; 3729 n/a int surrogateescape; 3730 n/a size_t error_pos, errlen; 3731 n/a char *errmsg; 3732 n/a PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */ 3733 n/a 3734 n/a if (locale_error_handler(errors, &surrogateescape) < 0) 3735 n/a return NULL; 3736 n/a 3737 n/a if (str[len] != '\0' || (size_t)len != strlen(str)) { 3738 n/a PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3739 n/a return NULL; 3740 n/a } 3741 n/a 3742 n/a if (surrogateescape) { 3743 n/a /* "surrogateescape" error handler */ 3744 n/a wstr = Py_DecodeLocale(str, &wlen); 3745 n/a if (wstr == NULL) { 3746 n/a if (wlen == (size_t)-1) 3747 n/a PyErr_NoMemory(); 3748 n/a else 3749 n/a PyErr_SetFromErrno(PyExc_OSError); 3750 n/a return NULL; 3751 n/a } 3752 n/a 3753 n/a unicode = PyUnicode_FromWideChar(wstr, wlen); 3754 n/a PyMem_RawFree(wstr); 3755 n/a } 3756 n/a else { 3757 n/a /* strict mode */ 3758 n/a #ifndef HAVE_BROKEN_MBSTOWCS 3759 n/a wlen = mbstowcs(NULL, str, 0); 3760 n/a #else 3761 n/a wlen = len; 3762 n/a #endif 3763 n/a if (wlen == (size_t)-1) 3764 n/a goto decode_error; 3765 n/a if (wlen+1 <= smallbuf_len) { 3766 n/a wstr = smallbuf; 3767 n/a } 3768 n/a else { 3769 n/a wstr = PyMem_New(wchar_t, wlen+1); 3770 n/a if (!wstr) 3771 n/a return PyErr_NoMemory(); 3772 n/a } 3773 n/a 3774 n/a wlen2 = mbstowcs(wstr, str, wlen+1); 3775 n/a if (wlen2 == (size_t)-1) { 3776 n/a if (wstr != smallbuf) 3777 n/a PyMem_Free(wstr); 3778 n/a goto decode_error; 3779 n/a } 3780 n/a #ifdef 
HAVE_BROKEN_MBSTOWCS 3781 n/a assert(wlen2 == wlen); 3782 n/a #endif 3783 n/a unicode = PyUnicode_FromWideChar(wstr, wlen2); 3784 n/a if (wstr != smallbuf) 3785 n/a PyMem_Free(wstr); 3786 n/a } 3787 n/a return unicode; 3788 n/a 3789 n/a decode_error: 3790 n/a errmsg = strerror(errno); 3791 n/a assert(errmsg != NULL); 3792 n/a 3793 n/a error_pos = mbstowcs_errorpos(str, len); 3794 n/a wstr = Py_DecodeLocale(errmsg, &errlen); 3795 n/a if (wstr != NULL) { 3796 n/a reason = PyUnicode_FromWideChar(wstr, errlen); 3797 n/a PyMem_RawFree(wstr); 3798 n/a } 3799 n/a 3800 n/a if (reason == NULL) 3801 n/a reason = PyUnicode_FromString( 3802 n/a "mbstowcs() encountered an invalid multibyte sequence"); 3803 n/a if (reason == NULL) 3804 n/a return NULL; 3805 n/a 3806 n/a exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3807 n/a "locale", str, len, 3808 n/a (Py_ssize_t)error_pos, 3809 n/a (Py_ssize_t)(error_pos+1), 3810 n/a reason); 3811 n/a Py_DECREF(reason); 3812 n/a if (exc != NULL) { 3813 n/a PyCodec_StrictErrors(exc); 3814 n/a Py_DECREF(exc); 3815 n/a } 3816 n/a return NULL; 3817 n/a } 3818 n/a 3819 n/a PyObject* 3820 n/a PyUnicode_DecodeLocale(const char *str, const char *errors) 3821 n/a { 3822 n/a Py_ssize_t size = (Py_ssize_t)strlen(str); 3823 n/a return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3824 n/a } 3825 n/a 3826 n/a 3827 n/a PyObject* 3828 n/a PyUnicode_DecodeFSDefault(const char *s) { 3829 n/a Py_ssize_t size = (Py_ssize_t)strlen(s); 3830 n/a return PyUnicode_DecodeFSDefaultAndSize(s, size); 3831 n/a } 3832 n/a 3833 n/a PyObject* 3834 n/a PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3835 n/a { 3836 n/a #if defined(__APPLE__) 3837 n/a return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL); 3838 n/a #else 3839 n/a PyInterpreterState *interp = PyThreadState_GET()->interp; 3840 n/a /* Bootstrap check: if the filesystem codec is implemented in Python, we 3841 n/a cannot use it to encode and 
decode filenames before it is loaded. Load 3842 n/a the Python codec requires to encode at least its own filename. Use the C 3843 n/a version of the locale codec until the codec registry is initialized and 3844 n/a the Python codec is loaded. 3845 n/a 3846 n/a Py_FileSystemDefaultEncoding is shared between all interpreters, we 3847 n/a cannot only rely on it: check also interp->fscodec_initialized for 3848 n/a subinterpreters. */ 3849 n/a if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3850 n/a return PyUnicode_Decode(s, size, 3851 n/a Py_FileSystemDefaultEncoding, 3852 n/a Py_FileSystemDefaultEncodeErrors); 3853 n/a } 3854 n/a else { 3855 n/a return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); 3856 n/a } 3857 n/a #endif 3858 n/a } 3859 n/a 3860 n/a 3861 n/a int 3862 n/a PyUnicode_FSConverter(PyObject* arg, void* addr) 3863 n/a { 3864 n/a PyObject *path = NULL; 3865 n/a PyObject *output = NULL; 3866 n/a Py_ssize_t size; 3867 n/a void *data; 3868 n/a if (arg == NULL) { 3869 n/a Py_DECREF(*(PyObject**)addr); 3870 n/a *(PyObject**)addr = NULL; 3871 n/a return 1; 3872 n/a } 3873 n/a path = PyOS_FSPath(arg); 3874 n/a if (path == NULL) { 3875 n/a return 0; 3876 n/a } 3877 n/a if (PyBytes_Check(path)) { 3878 n/a output = path; 3879 n/a } 3880 n/a else { // PyOS_FSPath() guarantees its returned value is bytes or str. 
3881 n/a output = PyUnicode_EncodeFSDefault(path); 3882 n/a Py_DECREF(path); 3883 n/a if (!output) { 3884 n/a return 0; 3885 n/a } 3886 n/a assert(PyBytes_Check(output)); 3887 n/a } 3888 n/a 3889 n/a size = PyBytes_GET_SIZE(output); 3890 n/a data = PyBytes_AS_STRING(output); 3891 n/a if ((size_t)size != strlen(data)) { 3892 n/a PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3893 n/a Py_DECREF(output); 3894 n/a return 0; 3895 n/a } 3896 n/a *(PyObject**)addr = output; 3897 n/a return Py_CLEANUP_SUPPORTED; 3898 n/a } 3899 n/a 3900 n/a 3901 n/a int 3902 n/a PyUnicode_FSDecoder(PyObject* arg, void* addr) 3903 n/a { 3904 n/a int is_buffer = 0; 3905 n/a PyObject *path = NULL; 3906 n/a PyObject *output = NULL; 3907 n/a if (arg == NULL) { 3908 n/a Py_DECREF(*(PyObject**)addr); 3909 n/a return 1; 3910 n/a } 3911 n/a 3912 n/a is_buffer = PyObject_CheckBuffer(arg); 3913 n/a if (!is_buffer) { 3914 n/a path = PyOS_FSPath(arg); 3915 n/a if (path == NULL) { 3916 n/a return 0; 3917 n/a } 3918 n/a } 3919 n/a else { 3920 n/a path = arg; 3921 n/a Py_INCREF(arg); 3922 n/a } 3923 n/a 3924 n/a if (PyUnicode_Check(path)) { 3925 n/a if (PyUnicode_READY(path) == -1) { 3926 n/a Py_DECREF(path); 3927 n/a return 0; 3928 n/a } 3929 n/a output = path; 3930 n/a } 3931 n/a else if (PyBytes_Check(path) || is_buffer) { 3932 n/a PyObject *path_bytes = NULL; 3933 n/a 3934 n/a if (!PyBytes_Check(path) && 3935 n/a PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 3936 n/a "path should be string, bytes, or os.PathLike, not %.200s", 3937 n/a Py_TYPE(arg)->tp_name)) { 3938 n/a Py_DECREF(path); 3939 n/a return 0; 3940 n/a } 3941 n/a path_bytes = PyBytes_FromObject(path); 3942 n/a Py_DECREF(path); 3943 n/a if (!path_bytes) { 3944 n/a return 0; 3945 n/a } 3946 n/a output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes), 3947 n/a PyBytes_GET_SIZE(path_bytes)); 3948 n/a Py_DECREF(path_bytes); 3949 n/a if (!output) { 3950 n/a return 0; 3951 n/a } 3952 n/a } 3953 n/a else { 3954 n/a 
PyErr_Format(PyExc_TypeError, 3955 n/a "path should be string, bytes, or os.PathLike, not %.200s", 3956 n/a Py_TYPE(arg)->tp_name); 3957 n/a Py_DECREF(path); 3958 n/a return 0; 3959 n/a } 3960 n/a if (PyUnicode_READY(output) == -1) { 3961 n/a Py_DECREF(output); 3962 n/a return 0; 3963 n/a } 3964 n/a if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3965 n/a PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3966 n/a PyErr_SetString(PyExc_ValueError, "embedded null character"); 3967 n/a Py_DECREF(output); 3968 n/a return 0; 3969 n/a } 3970 n/a *(PyObject**)addr = output; 3971 n/a return Py_CLEANUP_SUPPORTED; 3972 n/a } 3973 n/a 3974 n/a 3975 n/a const char * 3976 n/a PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3977 n/a { 3978 n/a PyObject *bytes; 3979 n/a 3980 n/a if (!PyUnicode_Check(unicode)) { 3981 n/a PyErr_BadArgument(); 3982 n/a return NULL; 3983 n/a } 3984 n/a if (PyUnicode_READY(unicode) == -1) 3985 n/a return NULL; 3986 n/a 3987 n/a if (PyUnicode_UTF8(unicode) == NULL) { 3988 n/a assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3989 n/a bytes = _PyUnicode_AsUTF8String(unicode, NULL); 3990 n/a if (bytes == NULL) 3991 n/a return NULL; 3992 n/a _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3993 n/a if (_PyUnicode_UTF8(unicode) == NULL) { 3994 n/a PyErr_NoMemory(); 3995 n/a Py_DECREF(bytes); 3996 n/a return NULL; 3997 n/a } 3998 n/a _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3999 n/a memcpy(_PyUnicode_UTF8(unicode), 4000 n/a PyBytes_AS_STRING(bytes), 4001 n/a _PyUnicode_UTF8_LENGTH(unicode) + 1); 4002 n/a Py_DECREF(bytes); 4003 n/a } 4004 n/a 4005 n/a if (psize) 4006 n/a *psize = PyUnicode_UTF8_LENGTH(unicode); 4007 n/a return PyUnicode_UTF8(unicode); 4008 n/a } 4009 n/a 4010 n/a const char * 4011 n/a PyUnicode_AsUTF8(PyObject *unicode) 4012 n/a { 4013 n/a return PyUnicode_AsUTF8AndSize(unicode, NULL); 4014 n/a } 4015 n/a 4016 n/a Py_UNICODE * 4017 n/a PyUnicode_AsUnicodeAndSize(PyObject *unicode, 
Py_ssize_t *size) 4018 n/a { 4019 n/a const unsigned char *one_byte; 4020 n/a #if SIZEOF_WCHAR_T == 4 4021 n/a const Py_UCS2 *two_bytes; 4022 n/a #else 4023 n/a const Py_UCS4 *four_bytes; 4024 n/a const Py_UCS4 *ucs4_end; 4025 n/a Py_ssize_t num_surrogates; 4026 n/a #endif 4027 n/a wchar_t *w; 4028 n/a wchar_t *wchar_end; 4029 n/a 4030 n/a if (!PyUnicode_Check(unicode)) { 4031 n/a PyErr_BadArgument(); 4032 n/a return NULL; 4033 n/a } 4034 n/a if (_PyUnicode_WSTR(unicode) == NULL) { 4035 n/a /* Non-ASCII compact unicode object */ 4036 n/a assert(_PyUnicode_KIND(unicode) != 0); 4037 n/a assert(PyUnicode_IS_READY(unicode)); 4038 n/a 4039 n/a if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 4040 n/a #if SIZEOF_WCHAR_T == 2 4041 n/a four_bytes = PyUnicode_4BYTE_DATA(unicode); 4042 n/a ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 4043 n/a num_surrogates = 0; 4044 n/a 4045 n/a for (; four_bytes < ucs4_end; ++four_bytes) { 4046 n/a if (*four_bytes > 0xFFFF) 4047 n/a ++num_surrogates; 4048 n/a } 4049 n/a 4050 n/a _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 4051 n/a sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 4052 n/a if (!_PyUnicode_WSTR(unicode)) { 4053 n/a PyErr_NoMemory(); 4054 n/a return NULL; 4055 n/a } 4056 n/a _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 4057 n/a 4058 n/a w = _PyUnicode_WSTR(unicode); 4059 n/a wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 4060 n/a four_bytes = PyUnicode_4BYTE_DATA(unicode); 4061 n/a for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 4062 n/a if (*four_bytes > 0xFFFF) { 4063 n/a assert(*four_bytes <= MAX_UNICODE); 4064 n/a /* encode surrogate pair in this case */ 4065 n/a *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 4066 n/a *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 4067 n/a } 4068 n/a else 4069 n/a *w = *four_bytes; 4070 n/a 4071 n/a if (w > wchar_end) { 4072 n/a assert(0 && "Miscalculated string end"); 4073 n/a } 4074 n/a } 4075 n/a *w = 0; 
4076 n/a #else 4077 n/a /* sizeof(wchar_t) == 4 */ 4078 n/a Py_FatalError("Impossible unicode object state, wstr and str " 4079 n/a "should share memory already."); 4080 n/a return NULL; 4081 n/a #endif 4082 n/a } 4083 n/a else { 4084 n/a if ((size_t)_PyUnicode_LENGTH(unicode) > 4085 n/a PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 4086 n/a PyErr_NoMemory(); 4087 n/a return NULL; 4088 n/a } 4089 n/a _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 4090 n/a (_PyUnicode_LENGTH(unicode) + 1)); 4091 n/a if (!_PyUnicode_WSTR(unicode)) { 4092 n/a PyErr_NoMemory(); 4093 n/a return NULL; 4094 n/a } 4095 n/a if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 4096 n/a _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 4097 n/a w = _PyUnicode_WSTR(unicode); 4098 n/a wchar_end = w + _PyUnicode_LENGTH(unicode); 4099 n/a 4100 n/a if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 4101 n/a one_byte = PyUnicode_1BYTE_DATA(unicode); 4102 n/a for (; w < wchar_end; ++one_byte, ++w) 4103 n/a *w = *one_byte; 4104 n/a /* null-terminate the wstr */ 4105 n/a *w = 0; 4106 n/a } 4107 n/a else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 4108 n/a #if SIZEOF_WCHAR_T == 4 4109 n/a two_bytes = PyUnicode_2BYTE_DATA(unicode); 4110 n/a for (; w < wchar_end; ++two_bytes, ++w) 4111 n/a *w = *two_bytes; 4112 n/a /* null-terminate the wstr */ 4113 n/a *w = 0; 4114 n/a #else 4115 n/a /* sizeof(wchar_t) == 2 */ 4116 n/a PyObject_FREE(_PyUnicode_WSTR(unicode)); 4117 n/a _PyUnicode_WSTR(unicode) = NULL; 4118 n/a Py_FatalError("Impossible unicode object state, wstr " 4119 n/a "and str should share memory already."); 4120 n/a return NULL; 4121 n/a #endif 4122 n/a } 4123 n/a else { 4124 n/a assert(0 && "This should never happen."); 4125 n/a } 4126 n/a } 4127 n/a } 4128 n/a if (size != NULL) 4129 n/a *size = PyUnicode_WSTR_LENGTH(unicode); 4130 n/a return _PyUnicode_WSTR(unicode); 4131 n/a } 4132 n/a 4133 n/a Py_UNICODE * 4134 n/a PyUnicode_AsUnicode(PyObject *unicode) 4135 
n/a { 4136 n/a return PyUnicode_AsUnicodeAndSize(unicode, NULL); 4137 n/a } 4138 n/a 4139 n/a 4140 n/a Py_ssize_t 4141 n/a PyUnicode_GetSize(PyObject *unicode) 4142 n/a { 4143 n/a if (!PyUnicode_Check(unicode)) { 4144 n/a PyErr_BadArgument(); 4145 n/a goto onError; 4146 n/a } 4147 n/a if (_PyUnicode_WSTR(unicode) == NULL) { 4148 n/a if (PyUnicode_AsUnicode(unicode) == NULL) 4149 n/a goto onError; 4150 n/a } 4151 n/a return PyUnicode_WSTR_LENGTH(unicode); 4152 n/a 4153 n/a onError: 4154 n/a return -1; 4155 n/a } 4156 n/a 4157 n/a Py_ssize_t 4158 n/a PyUnicode_GetLength(PyObject *unicode) 4159 n/a { 4160 n/a if (!PyUnicode_Check(unicode)) { 4161 n/a PyErr_BadArgument(); 4162 n/a return -1; 4163 n/a } 4164 n/a if (PyUnicode_READY(unicode) == -1) 4165 n/a return -1; 4166 n/a return PyUnicode_GET_LENGTH(unicode); 4167 n/a } 4168 n/a 4169 n/a Py_UCS4 4170 n/a PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4171 n/a { 4172 n/a void *data; 4173 n/a int kind; 4174 n/a 4175 n/a if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 4176 n/a PyErr_BadArgument(); 4177 n/a return (Py_UCS4)-1; 4178 n/a } 4179 n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4180 n/a PyErr_SetString(PyExc_IndexError, "string index out of range"); 4181 n/a return (Py_UCS4)-1; 4182 n/a } 4183 n/a data = PyUnicode_DATA(unicode); 4184 n/a kind = PyUnicode_KIND(unicode); 4185 n/a return PyUnicode_READ(kind, data, index); 4186 n/a } 4187 n/a 4188 n/a int 4189 n/a PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4190 n/a { 4191 n/a if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4192 n/a PyErr_BadArgument(); 4193 n/a return -1; 4194 n/a } 4195 n/a assert(PyUnicode_IS_READY(unicode)); 4196 n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4197 n/a PyErr_SetString(PyExc_IndexError, "string index out of range"); 4198 n/a return -1; 4199 n/a } 4200 n/a if (unicode_check_modifiable(unicode)) 4201 n/a return -1; 4202 n/a if 
(ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4203 n/a PyErr_SetString(PyExc_ValueError, "character out of range"); 4204 n/a return -1; 4205 n/a } 4206 n/a PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4207 n/a index, ch); 4208 n/a return 0; 4209 n/a } 4210 n/a 4211 n/a const char * 4212 n/a PyUnicode_GetDefaultEncoding(void) 4213 n/a { 4214 n/a return "utf-8"; 4215 n/a } 4216 n/a 4217 n/a /* create or adjust a UnicodeDecodeError */ 4218 n/a static void 4219 n/a make_decode_exception(PyObject **exceptionObject, 4220 n/a const char *encoding, 4221 n/a const char *input, Py_ssize_t length, 4222 n/a Py_ssize_t startpos, Py_ssize_t endpos, 4223 n/a const char *reason) 4224 n/a { 4225 n/a if (*exceptionObject == NULL) { 4226 n/a *exceptionObject = PyUnicodeDecodeError_Create( 4227 n/a encoding, input, length, startpos, endpos, reason); 4228 n/a } 4229 n/a else { 4230 n/a if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4231 n/a goto onError; 4232 n/a if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4233 n/a goto onError; 4234 n/a if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4235 n/a goto onError; 4236 n/a } 4237 n/a return; 4238 n/a 4239 n/a onError: 4240 n/a Py_CLEAR(*exceptionObject); 4241 n/a } 4242 n/a 4243 n/a #ifdef MS_WINDOWS 4244 n/a /* error handling callback helper: 4245 n/a build arguments, call the callback and check the arguments, 4246 n/a if no exception occurred, copy the replacement to the output 4247 n/a and adjust various state variables. 
4248 n/a return 0 on success, -1 on error 4249 n/a */ 4250 n/a 4251 n/a static int 4252 n/a unicode_decode_call_errorhandler_wchar( 4253 n/a const char *errors, PyObject **errorHandler, 4254 n/a const char *encoding, const char *reason, 4255 n/a const char **input, const char **inend, Py_ssize_t *startinpos, 4256 n/a Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4257 n/a PyObject **output, Py_ssize_t *outpos) 4258 n/a { 4259 n/a static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; 4260 n/a 4261 n/a PyObject *restuple = NULL; 4262 n/a PyObject *repunicode = NULL; 4263 n/a Py_ssize_t outsize; 4264 n/a Py_ssize_t insize; 4265 n/a Py_ssize_t requiredsize; 4266 n/a Py_ssize_t newpos; 4267 n/a PyObject *inputobj = NULL; 4268 n/a wchar_t *repwstr; 4269 n/a Py_ssize_t repwlen; 4270 n/a 4271 n/a assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4272 n/a outsize = _PyUnicode_WSTR_LENGTH(*output); 4273 n/a 4274 n/a if (*errorHandler == NULL) { 4275 n/a *errorHandler = PyCodec_LookupError(errors); 4276 n/a if (*errorHandler == NULL) 4277 n/a goto onError; 4278 n/a } 4279 n/a 4280 n/a make_decode_exception(exceptionObject, 4281 n/a encoding, 4282 n/a *input, *inend - *input, 4283 n/a *startinpos, *endinpos, 4284 n/a reason); 4285 n/a if (*exceptionObject == NULL) 4286 n/a goto onError; 4287 n/a 4288 n/a restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4289 n/a if (restuple == NULL) 4290 n/a goto onError; 4291 n/a if (!PyTuple_Check(restuple)) { 4292 n/a PyErr_SetString(PyExc_TypeError, &argparse[3]); 4293 n/a goto onError; 4294 n/a } 4295 n/a if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) 4296 n/a goto onError; 4297 n/a 4298 n/a /* Copy back the bytes variables, which might have been modified by the 4299 n/a callback */ 4300 n/a inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4301 n/a if (!inputobj) 4302 n/a goto onError; 4303 n/a *input = 
PyBytes_AS_STRING(inputobj); 4304 n/a insize = PyBytes_GET_SIZE(inputobj); 4305 n/a *inend = *input + insize; 4306 n/a /* we can DECREF safely, as the exception has another reference, 4307 n/a so the object won't go away. */ 4308 n/a Py_DECREF(inputobj); 4309 n/a 4310 n/a if (newpos<0) 4311 n/a newpos = insize+newpos; 4312 n/a if (newpos<0 || newpos>insize) { 4313 n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4314 n/a goto onError; 4315 n/a } 4316 n/a 4317 n/a repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4318 n/a if (repwstr == NULL) 4319 n/a goto onError; 4320 n/a /* need more space? (at least enough for what we 4321 n/a have+the replacement+the rest of the string (starting 4322 n/a at the new input position), so we won't have to check space 4323 n/a when there are no errors in the rest of the string) */ 4324 n/a requiredsize = *outpos; 4325 n/a if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4326 n/a goto overflow; 4327 n/a requiredsize += repwlen; 4328 n/a if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4329 n/a goto overflow; 4330 n/a requiredsize += insize - newpos; 4331 n/a if (requiredsize > outsize) { 4332 n/a if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4333 n/a requiredsize = 2*outsize; 4334 n/a if (unicode_resize(output, requiredsize) < 0) 4335 n/a goto onError; 4336 n/a } 4337 n/a wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4338 n/a *outpos += repwlen; 4339 n/a *endinpos = newpos; 4340 n/a *inptr = *input + newpos; 4341 n/a 4342 n/a /* we made it! 
*/ 4343 n/a Py_DECREF(restuple); 4344 n/a return 0; 4345 n/a 4346 n/a overflow: 4347 n/a PyErr_SetString(PyExc_OverflowError, 4348 n/a "decoded result is too long for a Python string"); 4349 n/a 4350 n/a onError: 4351 n/a Py_XDECREF(restuple); 4352 n/a return -1; 4353 n/a } 4354 n/a #endif /* MS_WINDOWS */ 4355 n/a 4356 n/a static int 4357 n/a unicode_decode_call_errorhandler_writer( 4358 n/a const char *errors, PyObject **errorHandler, 4359 n/a const char *encoding, const char *reason, 4360 n/a const char **input, const char **inend, Py_ssize_t *startinpos, 4361 n/a Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4362 n/a _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4363 n/a { 4364 n/a static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; 4365 n/a 4366 n/a PyObject *restuple = NULL; 4367 n/a PyObject *repunicode = NULL; 4368 n/a Py_ssize_t insize; 4369 n/a Py_ssize_t newpos; 4370 n/a Py_ssize_t replen; 4371 n/a PyObject *inputobj = NULL; 4372 n/a 4373 n/a if (*errorHandler == NULL) { 4374 n/a *errorHandler = PyCodec_LookupError(errors); 4375 n/a if (*errorHandler == NULL) 4376 n/a goto onError; 4377 n/a } 4378 n/a 4379 n/a make_decode_exception(exceptionObject, 4380 n/a encoding, 4381 n/a *input, *inend - *input, 4382 n/a *startinpos, *endinpos, 4383 n/a reason); 4384 n/a if (*exceptionObject == NULL) 4385 n/a goto onError; 4386 n/a 4387 n/a restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4388 n/a if (restuple == NULL) 4389 n/a goto onError; 4390 n/a if (!PyTuple_Check(restuple)) { 4391 n/a PyErr_SetString(PyExc_TypeError, &argparse[3]); 4392 n/a goto onError; 4393 n/a } 4394 n/a if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) 4395 n/a goto onError; 4396 n/a 4397 n/a /* Copy back the bytes variables, which might have been modified by the 4398 n/a callback */ 4399 n/a inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4400 n/a 
if (!inputobj) 4401 n/a goto onError; 4402 n/a *input = PyBytes_AS_STRING(inputobj); 4403 n/a insize = PyBytes_GET_SIZE(inputobj); 4404 n/a *inend = *input + insize; 4405 n/a /* we can DECREF safely, as the exception has another reference, 4406 n/a so the object won't go away. */ 4407 n/a Py_DECREF(inputobj); 4408 n/a 4409 n/a if (newpos<0) 4410 n/a newpos = insize+newpos; 4411 n/a if (newpos<0 || newpos>insize) { 4412 n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4413 n/a goto onError; 4414 n/a } 4415 n/a 4416 n/a replen = PyUnicode_GET_LENGTH(repunicode); 4417 n/a if (replen > 1) { 4418 n/a writer->min_length += replen - 1; 4419 n/a writer->overallocate = 1; 4420 n/a if (_PyUnicodeWriter_Prepare(writer, writer->min_length, 4421 n/a PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4422 n/a goto onError; 4423 n/a } 4424 n/a if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4425 n/a goto onError; 4426 n/a 4427 n/a *endinpos = newpos; 4428 n/a *inptr = *input + newpos; 4429 n/a 4430 n/a /* we made it! */ 4431 n/a Py_DECREF(restuple); 4432 n/a return 0; 4433 n/a 4434 n/a onError: 4435 n/a Py_XDECREF(restuple); 4436 n/a return -1; 4437 n/a } 4438 n/a 4439 n/a /* --- UTF-7 Codec -------------------------------------------------------- */ 4440 n/a 4441 n/a /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4442 n/a 4443 n/a /* Three simple macros defining base-64. */ 4444 n/a 4445 n/a /* Is c a base-64 character? */ 4446 n/a 4447 n/a #define IS_BASE64(c) \ 4448 n/a (((c) >= 'A' && (c) <= 'Z') || \ 4449 n/a ((c) >= 'a' && (c) <= 'z') || \ 4450 n/a ((c) >= '0' && (c) <= '9') || \ 4451 n/a (c) == '+' || (c) == '/') 4452 n/a 4453 n/a /* given that c is a base-64 character, what is its base-64 value? */ 4454 n/a 4455 n/a #define FROM_BASE64(c) \ 4456 n/a (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4457 n/a ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4458 n/a ((c) >= '0' && (c) <= '9') ? 
(c) - '0' + 52 : \ 4459 n/a (c) == '+' ? 62 : 63) 4460 n/a 4461 n/a /* What is the base-64 character of the bottom 6 bits of n? */ 4462 n/a 4463 n/a #define TO_BASE64(n) \ 4464 n/a ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4465 n/a 4466 n/a /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4467 n/a * decoded as itself. We are permissive on decoding; the only ASCII 4468 n/a * byte not decoding to itself is the + which begins a base64 4469 n/a * string. */ 4470 n/a 4471 n/a #define DECODE_DIRECT(c) \ 4472 n/a ((c) <= 127 && (c) != '+') 4473 n/a 4474 n/a /* The UTF-7 encoder treats ASCII characters differently according to 4475 n/a * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4476 n/a * the above). See RFC2152. This array identifies these different 4477 n/a * sets: 4478 n/a * 0 : "Set D" 4479 n/a * alphanumeric and '(),-./:? 4480 n/a * 1 : "Set O" 4481 n/a * !"#$%&*;<=>@[]^_`{|} 4482 n/a * 2 : "whitespace" 4483 n/a * ht nl cr sp 4484 n/a * 3 : special (must be base64 encoded) 4485 n/a * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4486 n/a */ 4487 n/a 4488 n/a static 4489 n/a char utf7_category[128] = { 4490 n/a /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4491 n/a 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4492 n/a /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4493 n/a 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4494 n/a /* sp ! " # $ % & ' ( ) * + , - . / */ 4495 n/a 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4496 n/a /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
*/ 4497 n/a 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4498 n/a /* @ A B C D E F G H I J K L M N O */ 4499 n/a 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4500 n/a /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4501 n/a 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4502 n/a /* ` a b c d e f g h i j k l m n o */ 4503 n/a 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4504 n/a /* p q r s t u v w x y z { | } ~ del */ 4505 n/a 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4506 n/a }; 4507 n/a 4508 n/a /* ENCODE_DIRECT: this character should be encoded as itself. The 4509 n/a * answer depends on whether we are encoding set O as itself, and also 4510 n/a * on whether we are encoding whitespace as itself. RFC2152 makes it 4511 n/a * clear that the answers to these questions vary between 4512 n/a * applications, so this code needs to be flexible. */ 4513 n/a 4514 n/a #define ENCODE_DIRECT(c, directO, directWS) \ 4515 n/a ((c) < 128 && (c) > 0 && \ 4516 n/a ((utf7_category[(c)] == 0) || \ 4517 n/a (directWS && (utf7_category[(c)] == 2)) || \ 4518 n/a (directO && (utf7_category[(c)] == 1)))) 4519 n/a 4520 n/a PyObject * 4521 n/a PyUnicode_DecodeUTF7(const char *s, 4522 n/a Py_ssize_t size, 4523 n/a const char *errors) 4524 n/a { 4525 n/a return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4526 n/a } 4527 n/a 4528 n/a /* The decoder. The only state we preserve is our read position, 4529 n/a * i.e. how many characters we have consumed. So if we end in the 4530 n/a * middle of a shift sequence we have to back off the read position 4531 n/a * and the output to the beginning of the sequence, otherwise we lose 4532 n/a * all the shift state (seen bits, number of bits seen, high 4533 n/a * surrogate). 
*/ 4534 n/a 4535 n/a PyObject * 4536 n/a PyUnicode_DecodeUTF7Stateful(const char *s, 4537 n/a Py_ssize_t size, 4538 n/a const char *errors, 4539 n/a Py_ssize_t *consumed) 4540 n/a { 4541 n/a const char *starts = s; 4542 n/a Py_ssize_t startinpos; 4543 n/a Py_ssize_t endinpos; 4544 n/a const char *e; 4545 n/a _PyUnicodeWriter writer; 4546 n/a const char *errmsg = ""; 4547 n/a int inShift = 0; 4548 n/a Py_ssize_t shiftOutStart; 4549 n/a unsigned int base64bits = 0; 4550 n/a unsigned long base64buffer = 0; 4551 n/a Py_UCS4 surrogate = 0; 4552 n/a PyObject *errorHandler = NULL; 4553 n/a PyObject *exc = NULL; 4554 n/a 4555 n/a if (size == 0) { 4556 n/a if (consumed) 4557 n/a *consumed = 0; 4558 n/a _Py_RETURN_UNICODE_EMPTY(); 4559 n/a } 4560 n/a 4561 n/a /* Start off assuming it's all ASCII. Widen later as necessary. */ 4562 n/a _PyUnicodeWriter_Init(&writer); 4563 n/a writer.min_length = size; 4564 n/a 4565 n/a shiftOutStart = 0; 4566 n/a e = s + size; 4567 n/a 4568 n/a while (s < e) { 4569 n/a Py_UCS4 ch; 4570 n/a restart: 4571 n/a ch = (unsigned char) *s; 4572 n/a 4573 n/a if (inShift) { /* in a base-64 section */ 4574 n/a if (IS_BASE64(ch)) { /* consume a base-64 character */ 4575 n/a base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4576 n/a base64bits += 6; 4577 n/a s++; 4578 n/a if (base64bits >= 16) { 4579 n/a /* we have enough bits for a UTF-16 value */ 4580 n/a Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4581 n/a base64bits -= 16; 4582 n/a base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4583 n/a assert(outCh <= 0xffff); 4584 n/a if (surrogate) { 4585 n/a /* expecting a second surrogate */ 4586 n/a if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4587 n/a Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4588 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0) 4589 n/a goto onError; 4590 n/a surrogate = 0; 4591 n/a continue; 4592 n/a } 4593 n/a else { 4594 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 
0) 4595 n/a goto onError; 4596 n/a surrogate = 0; 4597 n/a } 4598 n/a } 4599 n/a if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4600 n/a /* first surrogate */ 4601 n/a surrogate = outCh; 4602 n/a } 4603 n/a else { 4604 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0) 4605 n/a goto onError; 4606 n/a } 4607 n/a } 4608 n/a } 4609 n/a else { /* now leaving a base-64 section */ 4610 n/a inShift = 0; 4611 n/a if (base64bits > 0) { /* left-over bits */ 4612 n/a if (base64bits >= 6) { 4613 n/a /* We've seen at least one base-64 character */ 4614 n/a s++; 4615 n/a errmsg = "partial character in shift sequence"; 4616 n/a goto utf7Error; 4617 n/a } 4618 n/a else { 4619 n/a /* Some bits remain; they should be zero */ 4620 n/a if (base64buffer != 0) { 4621 n/a s++; 4622 n/a errmsg = "non-zero padding bits in shift sequence"; 4623 n/a goto utf7Error; 4624 n/a } 4625 n/a } 4626 n/a } 4627 n/a if (surrogate && DECODE_DIRECT(ch)) { 4628 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4629 n/a goto onError; 4630 n/a } 4631 n/a surrogate = 0; 4632 n/a if (ch == '-') { 4633 n/a /* '-' is absorbed; other terminating 4634 n/a characters are preserved */ 4635 n/a s++; 4636 n/a } 4637 n/a } 4638 n/a } 4639 n/a else if ( ch == '+' ) { 4640 n/a startinpos = s-starts; 4641 n/a s++; /* consume '+' */ 4642 n/a if (s < e && *s == '-') { /* '+-' encodes '+' */ 4643 n/a s++; 4644 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0) 4645 n/a goto onError; 4646 n/a } 4647 n/a else { /* begin base64-encoded section */ 4648 n/a inShift = 1; 4649 n/a surrogate = 0; 4650 n/a shiftOutStart = writer.pos; 4651 n/a base64bits = 0; 4652 n/a base64buffer = 0; 4653 n/a } 4654 n/a } 4655 n/a else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4656 n/a s++; 4657 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4658 n/a goto onError; 4659 n/a } 4660 n/a else { 4661 n/a startinpos = s-starts; 4662 n/a s++; 4663 n/a errmsg = "unexpected special character"; 
4664 n/a goto utf7Error; 4665 n/a } 4666 n/a continue; 4667 n/a utf7Error: 4668 n/a endinpos = s-starts; 4669 n/a if (unicode_decode_call_errorhandler_writer( 4670 n/a errors, &errorHandler, 4671 n/a "utf7", errmsg, 4672 n/a &starts, &e, &startinpos, &endinpos, &exc, &s, 4673 n/a &writer)) 4674 n/a goto onError; 4675 n/a } 4676 n/a 4677 n/a /* end of string */ 4678 n/a 4679 n/a if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4680 n/a /* if we're in an inconsistent state, that's an error */ 4681 n/a inShift = 0; 4682 n/a if (surrogate || 4683 n/a (base64bits >= 6) || 4684 n/a (base64bits > 0 && base64buffer != 0)) { 4685 n/a endinpos = size; 4686 n/a if (unicode_decode_call_errorhandler_writer( 4687 n/a errors, &errorHandler, 4688 n/a "utf7", "unterminated shift sequence", 4689 n/a &starts, &e, &startinpos, &endinpos, &exc, &s, 4690 n/a &writer)) 4691 n/a goto onError; 4692 n/a if (s < e) 4693 n/a goto restart; 4694 n/a } 4695 n/a } 4696 n/a 4697 n/a /* return state */ 4698 n/a if (consumed) { 4699 n/a if (inShift) { 4700 n/a *consumed = startinpos; 4701 n/a if (writer.pos != shiftOutStart && writer.maxchar > 127) { 4702 n/a PyObject *result = PyUnicode_FromKindAndData( 4703 n/a writer.kind, writer.data, shiftOutStart); 4704 n/a Py_XDECREF(errorHandler); 4705 n/a Py_XDECREF(exc); 4706 n/a _PyUnicodeWriter_Dealloc(&writer); 4707 n/a return result; 4708 n/a } 4709 n/a writer.pos = shiftOutStart; /* back off output */ 4710 n/a } 4711 n/a else { 4712 n/a *consumed = s-starts; 4713 n/a } 4714 n/a } 4715 n/a 4716 n/a Py_XDECREF(errorHandler); 4717 n/a Py_XDECREF(exc); 4718 n/a return _PyUnicodeWriter_Finish(&writer); 4719 n/a 4720 n/a onError: 4721 n/a Py_XDECREF(errorHandler); 4722 n/a Py_XDECREF(exc); 4723 n/a _PyUnicodeWriter_Dealloc(&writer); 4724 n/a return NULL; 4725 n/a } 4726 n/a 4727 n/a 4728 n/a PyObject * 4729 n/a _PyUnicode_EncodeUTF7(PyObject *str, 4730 n/a int base64SetO, 4731 n/a int base64WhiteSpace, 4732 n/a const char *errors) 4733 
n/a { 4734 n/a int kind; 4735 n/a void *data; 4736 n/a Py_ssize_t len; 4737 n/a PyObject *v; 4738 n/a int inShift = 0; 4739 n/a Py_ssize_t i; 4740 n/a unsigned int base64bits = 0; 4741 n/a unsigned long base64buffer = 0; 4742 n/a char * out; 4743 n/a char * start; 4744 n/a 4745 n/a if (PyUnicode_READY(str) == -1) 4746 n/a return NULL; 4747 n/a kind = PyUnicode_KIND(str); 4748 n/a data = PyUnicode_DATA(str); 4749 n/a len = PyUnicode_GET_LENGTH(str); 4750 n/a 4751 n/a if (len == 0) 4752 n/a return PyBytes_FromStringAndSize(NULL, 0); 4753 n/a 4754 n/a /* It might be possible to tighten this worst case */ 4755 n/a if (len > PY_SSIZE_T_MAX / 8) 4756 n/a return PyErr_NoMemory(); 4757 n/a v = PyBytes_FromStringAndSize(NULL, len * 8); 4758 n/a if (v == NULL) 4759 n/a return NULL; 4760 n/a 4761 n/a start = out = PyBytes_AS_STRING(v); 4762 n/a for (i = 0; i < len; ++i) { 4763 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4764 n/a 4765 n/a if (inShift) { 4766 n/a if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4767 n/a /* shifting out */ 4768 n/a if (base64bits) { /* output remaining bits */ 4769 n/a *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4770 n/a base64buffer = 0; 4771 n/a base64bits = 0; 4772 n/a } 4773 n/a inShift = 0; 4774 n/a /* Characters not in the BASE64 set implicitly unshift the sequence 4775 n/a so no '-' is required, except if the character is itself a '-' */ 4776 n/a if (IS_BASE64(ch) || ch == '-') { 4777 n/a *out++ = '-'; 4778 n/a } 4779 n/a *out++ = (char) ch; 4780 n/a } 4781 n/a else { 4782 n/a goto encode_char; 4783 n/a } 4784 n/a } 4785 n/a else { /* not in a shift sequence */ 4786 n/a if (ch == '+') { 4787 n/a *out++ = '+'; 4788 n/a *out++ = '-'; 4789 n/a } 4790 n/a else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4791 n/a *out++ = (char) ch; 4792 n/a } 4793 n/a else { 4794 n/a *out++ = '+'; 4795 n/a inShift = 1; 4796 n/a goto encode_char; 4797 n/a } 4798 n/a } 4799 n/a continue; 4800 n/a encode_char: 4801 n/a if (ch >= 
0x10000) { 4802 n/a assert(ch <= MAX_UNICODE); 4803 n/a 4804 n/a /* code first surrogate */ 4805 n/a base64bits += 16; 4806 n/a base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); 4807 n/a while (base64bits >= 6) { 4808 n/a *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4809 n/a base64bits -= 6; 4810 n/a } 4811 n/a /* prepare second surrogate */ 4812 n/a ch = Py_UNICODE_LOW_SURROGATE(ch); 4813 n/a } 4814 n/a base64bits += 16; 4815 n/a base64buffer = (base64buffer << 16) | ch; 4816 n/a while (base64bits >= 6) { 4817 n/a *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4818 n/a base64bits -= 6; 4819 n/a } 4820 n/a } 4821 n/a if (base64bits) 4822 n/a *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4823 n/a if (inShift) 4824 n/a *out++ = '-'; 4825 n/a if (_PyBytes_Resize(&v, out - start) < 0) 4826 n/a return NULL; 4827 n/a return v; 4828 n/a } 4829 n/a PyObject * 4830 n/a PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4831 n/a Py_ssize_t size, 4832 n/a int base64SetO, 4833 n/a int base64WhiteSpace, 4834 n/a const char *errors) 4835 n/a { 4836 n/a PyObject *result; 4837 n/a PyObject *tmp = PyUnicode_FromWideChar(s, size); 4838 n/a if (tmp == NULL) 4839 n/a return NULL; 4840 n/a result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4841 n/a base64WhiteSpace, errors); 4842 n/a Py_DECREF(tmp); 4843 n/a return result; 4844 n/a } 4845 n/a 4846 n/a #undef IS_BASE64 4847 n/a #undef FROM_BASE64 4848 n/a #undef TO_BASE64 4849 n/a #undef DECODE_DIRECT 4850 n/a #undef ENCODE_DIRECT 4851 n/a 4852 n/a /* --- UTF-8 Codec -------------------------------------------------------- */ 4853 n/a 4854 n/a PyObject * 4855 n/a PyUnicode_DecodeUTF8(const char *s, 4856 n/a Py_ssize_t size, 4857 n/a const char *errors) 4858 n/a { 4859 n/a return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4860 n/a } 4861 n/a 4862 n/a #include "stringlib/asciilib.h" 4863 n/a #include "stringlib/codecs.h" 4864 n/a #include "stringlib/undef.h" 4865 n/a 4866 n/a #include 
"stringlib/ucs1lib.h" 4867 n/a #include "stringlib/codecs.h" 4868 n/a #include "stringlib/undef.h" 4869 n/a 4870 n/a #include "stringlib/ucs2lib.h" 4871 n/a #include "stringlib/codecs.h" 4872 n/a #include "stringlib/undef.h" 4873 n/a 4874 n/a #include "stringlib/ucs4lib.h" 4875 n/a #include "stringlib/codecs.h" 4876 n/a #include "stringlib/undef.h" 4877 n/a 4878 n/a /* Mask to quickly check whether a C 'long' contains a 4879 n/a non-ASCII, UTF8-encoded char. */ 4880 n/a #if (SIZEOF_LONG == 8) 4881 n/a # define ASCII_CHAR_MASK 0x8080808080808080UL 4882 n/a #elif (SIZEOF_LONG == 4) 4883 n/a # define ASCII_CHAR_MASK 0x80808080UL 4884 n/a #else 4885 n/a # error C 'long' size should be either 4 or 8! 4886 n/a #endif 4887 n/a 4888 n/a static Py_ssize_t 4889 n/a ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4890 n/a { 4891 n/a const char *p = start; 4892 n/a const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4893 n/a 4894 n/a /* 4895 n/a * Issue #17237: m68k is a bit different from most architectures in 4896 n/a * that objects do not use "natural alignment" - for example, int and 4897 n/a * long are only aligned at 2-byte boundaries. Therefore the assert() 4898 n/a * won't work; also, tests have shown that skipping the "optimised 4899 n/a * version" will even speed up m68k. 4900 n/a */ 4901 n/a #if !defined(__m68k__) 4902 n/a #if SIZEOF_LONG <= SIZEOF_VOID_P 4903 n/a assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4904 n/a if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4905 n/a /* Fast path, see in STRINGLIB(utf8_decode) for 4906 n/a an explanation. 
*/ 4907 n/a /* Help allocation */ 4908 n/a const char *_p = p; 4909 n/a Py_UCS1 * q = dest; 4910 n/a while (_p < aligned_end) { 4911 n/a unsigned long value = *(const unsigned long *) _p; 4912 n/a if (value & ASCII_CHAR_MASK) 4913 n/a break; 4914 n/a *((unsigned long *)q) = value; 4915 n/a _p += SIZEOF_LONG; 4916 n/a q += SIZEOF_LONG; 4917 n/a } 4918 n/a p = _p; 4919 n/a while (p < end) { 4920 n/a if ((unsigned char)*p & 0x80) 4921 n/a break; 4922 n/a *q++ = *p++; 4923 n/a } 4924 n/a return p - start; 4925 n/a } 4926 n/a #endif 4927 n/a #endif 4928 n/a while (p < end) { 4929 n/a /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4930 n/a for an explanation. */ 4931 n/a if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4932 n/a /* Help allocation */ 4933 n/a const char *_p = p; 4934 n/a while (_p < aligned_end) { 4935 n/a unsigned long value = *(unsigned long *) _p; 4936 n/a if (value & ASCII_CHAR_MASK) 4937 n/a break; 4938 n/a _p += SIZEOF_LONG; 4939 n/a } 4940 n/a p = _p; 4941 n/a if (_p == end) 4942 n/a break; 4943 n/a } 4944 n/a if ((unsigned char)*p & 0x80) 4945 n/a break; 4946 n/a ++p; 4947 n/a } 4948 n/a memcpy(dest, start, p - start); 4949 n/a return p - start; 4950 n/a } 4951 n/a 4952 n/a PyObject * 4953 n/a PyUnicode_DecodeUTF8Stateful(const char *s, 4954 n/a Py_ssize_t size, 4955 n/a const char *errors, 4956 n/a Py_ssize_t *consumed) 4957 n/a { 4958 n/a _PyUnicodeWriter writer; 4959 n/a const char *starts = s; 4960 n/a const char *end = s + size; 4961 n/a 4962 n/a Py_ssize_t startinpos; 4963 n/a Py_ssize_t endinpos; 4964 n/a const char *errmsg = ""; 4965 n/a PyObject *error_handler_obj = NULL; 4966 n/a PyObject *exc = NULL; 4967 n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 4968 n/a 4969 n/a if (size == 0) { 4970 n/a if (consumed) 4971 n/a *consumed = 0; 4972 n/a _Py_RETURN_UNICODE_EMPTY(); 4973 n/a } 4974 n/a 4975 n/a /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4976 n/a if (size == 1 && (unsigned char)s[0] < 128) { 4977 n/a if (consumed) 4978 n/a *consumed = 1; 4979 n/a return get_latin1_char((unsigned char)s[0]); 4980 n/a } 4981 n/a 4982 n/a _PyUnicodeWriter_Init(&writer); 4983 n/a writer.min_length = size; 4984 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4985 n/a goto onError; 4986 n/a 4987 n/a writer.pos = ascii_decode(s, end, writer.data); 4988 n/a s += writer.pos; 4989 n/a while (s < end) { 4990 n/a Py_UCS4 ch; 4991 n/a int kind = writer.kind; 4992 n/a 4993 n/a if (kind == PyUnicode_1BYTE_KIND) { 4994 n/a if (PyUnicode_IS_ASCII(writer.buffer)) 4995 n/a ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); 4996 n/a else 4997 n/a ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); 4998 n/a } else if (kind == PyUnicode_2BYTE_KIND) { 4999 n/a ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); 5000 n/a } else { 5001 n/a assert(kind == PyUnicode_4BYTE_KIND); 5002 n/a ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); 5003 n/a } 5004 n/a 5005 n/a switch (ch) { 5006 n/a case 0: 5007 n/a if (s == end || consumed) 5008 n/a goto End; 5009 n/a errmsg = "unexpected end of data"; 5010 n/a startinpos = s - starts; 5011 n/a endinpos = end - starts; 5012 n/a break; 5013 n/a case 1: 5014 n/a errmsg = "invalid start byte"; 5015 n/a startinpos = s - starts; 5016 n/a endinpos = startinpos + 1; 5017 n/a break; 5018 n/a case 2: 5019 n/a case 3: 5020 n/a case 4: 5021 n/a errmsg = "invalid continuation byte"; 5022 n/a startinpos = s - starts; 5023 n/a endinpos = startinpos + ch - 1; 5024 n/a break; 5025 n/a default: 5026 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5027 n/a goto onError; 5028 n/a continue; 5029 n/a } 5030 n/a 5031 n/a if (error_handler == _Py_ERROR_UNKNOWN) 5032 n/a error_handler = get_error_handler(errors); 5033 n/a 5034 n/a switch (error_handler) { 5035 n/a case _Py_ERROR_IGNORE: 5036 n/a s += (endinpos - startinpos); 5037 n/a break; 5038 
n/a 5039 n/a case _Py_ERROR_REPLACE: 5040 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) 5041 n/a goto onError; 5042 n/a s += (endinpos - startinpos); 5043 n/a break; 5044 n/a 5045 n/a case _Py_ERROR_SURROGATEESCAPE: 5046 n/a { 5047 n/a Py_ssize_t i; 5048 n/a 5049 n/a if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) 5050 n/a goto onError; 5051 n/a for (i=startinpos; i<endinpos; i++) { 5052 n/a ch = (Py_UCS4)(unsigned char)(starts[i]); 5053 n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos, 5054 n/a ch + 0xdc00); 5055 n/a writer.pos++; 5056 n/a } 5057 n/a s += (endinpos - startinpos); 5058 n/a break; 5059 n/a } 5060 n/a 5061 n/a default: 5062 n/a if (unicode_decode_call_errorhandler_writer( 5063 n/a errors, &error_handler_obj, 5064 n/a "utf-8", errmsg, 5065 n/a &starts, &end, &startinpos, &endinpos, &exc, &s, 5066 n/a &writer)) 5067 n/a goto onError; 5068 n/a } 5069 n/a } 5070 n/a 5071 n/a End: 5072 n/a if (consumed) 5073 n/a *consumed = s - starts; 5074 n/a 5075 n/a Py_XDECREF(error_handler_obj); 5076 n/a Py_XDECREF(exc); 5077 n/a return _PyUnicodeWriter_Finish(&writer); 5078 n/a 5079 n/a onError: 5080 n/a Py_XDECREF(error_handler_obj); 5081 n/a Py_XDECREF(exc); 5082 n/a _PyUnicodeWriter_Dealloc(&writer); 5083 n/a return NULL; 5084 n/a } 5085 n/a 5086 n/a #if defined(__APPLE__) || defined(__ANDROID__) 5087 n/a 5088 n/a /* Simplified UTF-8 decoder using surrogateescape error handler, 5089 n/a used to decode the command line arguments on Mac OS X and Android. 5090 n/a 5091 n/a Return a pointer to a newly allocated wide character string (use 5092 n/a PyMem_RawFree() to free the memory), or NULL on memory allocation error. 
*/ 5093 n/a 5094 n/a wchar_t* 5095 n/a _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 5096 n/a { 5097 n/a const char *e; 5098 n/a wchar_t *unicode; 5099 n/a Py_ssize_t outpos; 5100 n/a 5101 n/a /* Note: size will always be longer than the resulting Unicode 5102 n/a character count */ 5103 n/a if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) 5104 n/a return NULL; 5105 n/a unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 5106 n/a if (!unicode) 5107 n/a return NULL; 5108 n/a 5109 n/a /* Unpack UTF-8 encoded data */ 5110 n/a e = s + size; 5111 n/a outpos = 0; 5112 n/a while (s < e) { 5113 n/a Py_UCS4 ch; 5114 n/a #if SIZEOF_WCHAR_T == 4 5115 n/a ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 5116 n/a #else 5117 n/a ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 5118 n/a #endif 5119 n/a if (ch > 0xFF) { 5120 n/a #if SIZEOF_WCHAR_T == 4 5121 n/a assert(0); 5122 n/a #else 5123 n/a assert(ch > 0xFFFF && ch <= MAX_UNICODE); 5124 n/a /* compute and append the two surrogates: */ 5125 n/a unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 5126 n/a unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 5127 n/a #endif 5128 n/a } 5129 n/a else { 5130 n/a if (!ch && s == e) 5131 n/a break; 5132 n/a /* surrogateescape */ 5133 n/a unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 5134 n/a } 5135 n/a } 5136 n/a unicode[outpos] = L'\0'; 5137 n/a return unicode; 5138 n/a } 5139 n/a 5140 n/a #endif /* __APPLE__ or __ANDROID__ */ 5141 n/a 5142 n/a /* Primary internal function which creates utf8 encoded bytes objects. 5143 n/a 5144 n/a Allocation strategy: if the string is short, convert into a stack buffer 5145 n/a and allocate exactly as much space needed at the end. Else allocate the 5146 n/a maximum possible needed (4 result bytes per Unicode character), and return 5147 n/a the excess memory at the end. 
5148 n/a */ 5149 n/a PyObject * 5150 n/a _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 5151 n/a { 5152 n/a enum PyUnicode_Kind kind; 5153 n/a void *data; 5154 n/a Py_ssize_t size; 5155 n/a 5156 n/a if (!PyUnicode_Check(unicode)) { 5157 n/a PyErr_BadArgument(); 5158 n/a return NULL; 5159 n/a } 5160 n/a 5161 n/a if (PyUnicode_READY(unicode) == -1) 5162 n/a return NULL; 5163 n/a 5164 n/a if (PyUnicode_UTF8(unicode)) 5165 n/a return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 5166 n/a PyUnicode_UTF8_LENGTH(unicode)); 5167 n/a 5168 n/a kind = PyUnicode_KIND(unicode); 5169 n/a data = PyUnicode_DATA(unicode); 5170 n/a size = PyUnicode_GET_LENGTH(unicode); 5171 n/a 5172 n/a switch (kind) { 5173 n/a default: 5174 n/a assert(0); 5175 n/a case PyUnicode_1BYTE_KIND: 5176 n/a /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 5177 n/a assert(!PyUnicode_IS_ASCII(unicode)); 5178 n/a return ucs1lib_utf8_encoder(unicode, data, size, errors); 5179 n/a case PyUnicode_2BYTE_KIND: 5180 n/a return ucs2lib_utf8_encoder(unicode, data, size, errors); 5181 n/a case PyUnicode_4BYTE_KIND: 5182 n/a return ucs4lib_utf8_encoder(unicode, data, size, errors); 5183 n/a } 5184 n/a } 5185 n/a 5186 n/a PyObject * 5187 n/a PyUnicode_EncodeUTF8(const Py_UNICODE *s, 5188 n/a Py_ssize_t size, 5189 n/a const char *errors) 5190 n/a { 5191 n/a PyObject *v, *unicode; 5192 n/a 5193 n/a unicode = PyUnicode_FromWideChar(s, size); 5194 n/a if (unicode == NULL) 5195 n/a return NULL; 5196 n/a v = _PyUnicode_AsUTF8String(unicode, errors); 5197 n/a Py_DECREF(unicode); 5198 n/a return v; 5199 n/a } 5200 n/a 5201 n/a PyObject * 5202 n/a PyUnicode_AsUTF8String(PyObject *unicode) 5203 n/a { 5204 n/a return _PyUnicode_AsUTF8String(unicode, NULL); 5205 n/a } 5206 n/a 5207 n/a /* --- UTF-32 Codec ------------------------------------------------------- */ 5208 n/a 5209 n/a PyObject * 5210 n/a PyUnicode_DecodeUTF32(const char *s, 5211 n/a Py_ssize_t size, 5212 n/a const char *errors, 
5213 n/a int *byteorder) 5214 n/a { 5215 n/a return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 5216 n/a } 5217 n/a 5218 n/a PyObject * 5219 n/a PyUnicode_DecodeUTF32Stateful(const char *s, 5220 n/a Py_ssize_t size, 5221 n/a const char *errors, 5222 n/a int *byteorder, 5223 n/a Py_ssize_t *consumed) 5224 n/a { 5225 n/a const char *starts = s; 5226 n/a Py_ssize_t startinpos; 5227 n/a Py_ssize_t endinpos; 5228 n/a _PyUnicodeWriter writer; 5229 n/a const unsigned char *q, *e; 5230 n/a int le, bo = 0; /* assume native ordering by default */ 5231 n/a const char *encoding; 5232 n/a const char *errmsg = ""; 5233 n/a PyObject *errorHandler = NULL; 5234 n/a PyObject *exc = NULL; 5235 n/a 5236 n/a q = (unsigned char *)s; 5237 n/a e = q + size; 5238 n/a 5239 n/a if (byteorder) 5240 n/a bo = *byteorder; 5241 n/a 5242 n/a /* Check for BOM marks (U+FEFF) in the input and adjust current 5243 n/a byte order setting accordingly. In native mode, the leading BOM 5244 n/a mark is skipped, in all other modes, it is copied to the output 5245 n/a stream as-is (giving a ZWNBSP character). */ 5246 n/a if (bo == 0 && size >= 4) { 5247 n/a Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5248 n/a if (bom == 0x0000FEFF) { 5249 n/a bo = -1; 5250 n/a q += 4; 5251 n/a } 5252 n/a else if (bom == 0xFFFE0000) { 5253 n/a bo = 1; 5254 n/a q += 4; 5255 n/a } 5256 n/a if (byteorder) 5257 n/a *byteorder = bo; 5258 n/a } 5259 n/a 5260 n/a if (q == e) { 5261 n/a if (consumed) 5262 n/a *consumed = size; 5263 n/a _Py_RETURN_UNICODE_EMPTY(); 5264 n/a } 5265 n/a 5266 n/a #ifdef WORDS_BIGENDIAN 5267 n/a le = bo < 0; 5268 n/a #else 5269 n/a le = bo <= 0; 5270 n/a #endif 5271 n/a encoding = le ? 
"utf-32-le" : "utf-32-be"; 5272 n/a 5273 n/a _PyUnicodeWriter_Init(&writer); 5274 n/a writer.min_length = (e - q + 3) / 4; 5275 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5276 n/a goto onError; 5277 n/a 5278 n/a while (1) { 5279 n/a Py_UCS4 ch = 0; 5280 n/a Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5281 n/a 5282 n/a if (e - q >= 4) { 5283 n/a enum PyUnicode_Kind kind = writer.kind; 5284 n/a void *data = writer.data; 5285 n/a const unsigned char *last = e - 4; 5286 n/a Py_ssize_t pos = writer.pos; 5287 n/a if (le) { 5288 n/a do { 5289 n/a ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5290 n/a if (ch > maxch) 5291 n/a break; 5292 n/a if (kind != PyUnicode_1BYTE_KIND && 5293 n/a Py_UNICODE_IS_SURROGATE(ch)) 5294 n/a break; 5295 n/a PyUnicode_WRITE(kind, data, pos++, ch); 5296 n/a q += 4; 5297 n/a } while (q <= last); 5298 n/a } 5299 n/a else { 5300 n/a do { 5301 n/a ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5302 n/a if (ch > maxch) 5303 n/a break; 5304 n/a if (kind != PyUnicode_1BYTE_KIND && 5305 n/a Py_UNICODE_IS_SURROGATE(ch)) 5306 n/a break; 5307 n/a PyUnicode_WRITE(kind, data, pos++, ch); 5308 n/a q += 4; 5309 n/a } while (q <= last); 5310 n/a } 5311 n/a writer.pos = pos; 5312 n/a } 5313 n/a 5314 n/a if (Py_UNICODE_IS_SURROGATE(ch)) { 5315 n/a errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5316 n/a startinpos = ((const char *)q) - starts; 5317 n/a endinpos = startinpos + 4; 5318 n/a } 5319 n/a else if (ch <= maxch) { 5320 n/a if (q == e || consumed) 5321 n/a break; 5322 n/a /* remaining bytes at the end? 
(size should be divisible by 4) */ 5323 n/a errmsg = "truncated data"; 5324 n/a startinpos = ((const char *)q) - starts; 5325 n/a endinpos = ((const char *)e) - starts; 5326 n/a } 5327 n/a else { 5328 n/a if (ch < 0x110000) { 5329 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5330 n/a goto onError; 5331 n/a q += 4; 5332 n/a continue; 5333 n/a } 5334 n/a errmsg = "code point not in range(0x110000)"; 5335 n/a startinpos = ((const char *)q) - starts; 5336 n/a endinpos = startinpos + 4; 5337 n/a } 5338 n/a 5339 n/a /* The remaining input chars are ignored if the callback 5340 n/a chooses to skip the input */ 5341 n/a if (unicode_decode_call_errorhandler_writer( 5342 n/a errors, &errorHandler, 5343 n/a encoding, errmsg, 5344 n/a &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5345 n/a &writer)) 5346 n/a goto onError; 5347 n/a } 5348 n/a 5349 n/a if (consumed) 5350 n/a *consumed = (const char *)q-starts; 5351 n/a 5352 n/a Py_XDECREF(errorHandler); 5353 n/a Py_XDECREF(exc); 5354 n/a return _PyUnicodeWriter_Finish(&writer); 5355 n/a 5356 n/a onError: 5357 n/a _PyUnicodeWriter_Dealloc(&writer); 5358 n/a Py_XDECREF(errorHandler); 5359 n/a Py_XDECREF(exc); 5360 n/a return NULL; 5361 n/a } 5362 n/a 5363 n/a PyObject * 5364 n/a _PyUnicode_EncodeUTF32(PyObject *str, 5365 n/a const char *errors, 5366 n/a int byteorder) 5367 n/a { 5368 n/a enum PyUnicode_Kind kind; 5369 n/a const void *data; 5370 n/a Py_ssize_t len; 5371 n/a PyObject *v; 5372 n/a uint32_t *out; 5373 n/a #if PY_LITTLE_ENDIAN 5374 n/a int native_ordering = byteorder <= 0; 5375 n/a #else 5376 n/a int native_ordering = byteorder >= 0; 5377 n/a #endif 5378 n/a const char *encoding; 5379 n/a Py_ssize_t nsize, pos; 5380 n/a PyObject *errorHandler = NULL; 5381 n/a PyObject *exc = NULL; 5382 n/a PyObject *rep = NULL; 5383 n/a 5384 n/a if (!PyUnicode_Check(str)) { 5385 n/a PyErr_BadArgument(); 5386 n/a return NULL; 5387 n/a } 5388 n/a if (PyUnicode_READY(str) == -1) 5389 n/a return 
NULL; 5390 n/a kind = PyUnicode_KIND(str); 5391 n/a data = PyUnicode_DATA(str); 5392 n/a len = PyUnicode_GET_LENGTH(str); 5393 n/a 5394 n/a if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5395 n/a return PyErr_NoMemory(); 5396 n/a nsize = len + (byteorder == 0); 5397 n/a v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5398 n/a if (v == NULL) 5399 n/a return NULL; 5400 n/a 5401 n/a /* output buffer is 4-bytes aligned */ 5402 n/a assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5403 n/a out = (uint32_t *)PyBytes_AS_STRING(v); 5404 n/a if (byteorder == 0) 5405 n/a *out++ = 0xFEFF; 5406 n/a if (len == 0) 5407 n/a goto done; 5408 n/a 5409 n/a if (byteorder == -1) 5410 n/a encoding = "utf-32-le"; 5411 n/a else if (byteorder == 1) 5412 n/a encoding = "utf-32-be"; 5413 n/a else 5414 n/a encoding = "utf-32"; 5415 n/a 5416 n/a if (kind == PyUnicode_1BYTE_KIND) { 5417 n/a ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5418 n/a goto done; 5419 n/a } 5420 n/a 5421 n/a pos = 0; 5422 n/a while (pos < len) { 5423 n/a Py_ssize_t repsize, moreunits; 5424 n/a 5425 n/a if (kind == PyUnicode_2BYTE_KIND) { 5426 n/a pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5427 n/a &out, native_ordering); 5428 n/a } 5429 n/a else { 5430 n/a assert(kind == PyUnicode_4BYTE_KIND); 5431 n/a pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5432 n/a &out, native_ordering); 5433 n/a } 5434 n/a if (pos == len) 5435 n/a break; 5436 n/a 5437 n/a rep = unicode_encode_call_errorhandler( 5438 n/a errors, &errorHandler, 5439 n/a encoding, "surrogates not allowed", 5440 n/a str, &exc, pos, pos + 1, &pos); 5441 n/a if (!rep) 5442 n/a goto error; 5443 n/a 5444 n/a if (PyBytes_Check(rep)) { 5445 n/a repsize = PyBytes_GET_SIZE(rep); 5446 n/a if (repsize & 3) { 5447 n/a raise_encode_exception(&exc, encoding, 5448 n/a str, pos - 1, pos, 5449 n/a "surrogates not allowed"); 5450 n/a goto error; 5451 n/a } 5452 n/a moreunits = repsize / 4; 5453 n/a } 5454 
n/a else { 5455 n/a assert(PyUnicode_Check(rep)); 5456 n/a if (PyUnicode_READY(rep) < 0) 5457 n/a goto error; 5458 n/a moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5459 n/a if (!PyUnicode_IS_ASCII(rep)) { 5460 n/a raise_encode_exception(&exc, encoding, 5461 n/a str, pos - 1, pos, 5462 n/a "surrogates not allowed"); 5463 n/a goto error; 5464 n/a } 5465 n/a } 5466 n/a 5467 n/a /* four bytes are reserved for each surrogate */ 5468 n/a if (moreunits > 1) { 5469 n/a Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); 5470 n/a Py_ssize_t morebytes = 4 * (moreunits - 1); 5471 n/a if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5472 n/a /* integer overflow */ 5473 n/a PyErr_NoMemory(); 5474 n/a goto error; 5475 n/a } 5476 n/a if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5477 n/a goto error; 5478 n/a out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; 5479 n/a } 5480 n/a 5481 n/a if (PyBytes_Check(rep)) { 5482 n/a memcpy(out, PyBytes_AS_STRING(rep), repsize); 5483 n/a out += moreunits; 5484 n/a } else /* rep is unicode */ { 5485 n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5486 n/a ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5487 n/a &out, native_ordering); 5488 n/a } 5489 n/a 5490 n/a Py_CLEAR(rep); 5491 n/a } 5492 n/a 5493 n/a /* Cut back to size actually needed. This is necessary for, for example, 5494 n/a encoding of a string containing isolated surrogates and the 'ignore' 5495 n/a handler is used. 
*/ 5496 n/a nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5497 n/a if (nsize != PyBytes_GET_SIZE(v)) 5498 n/a _PyBytes_Resize(&v, nsize); 5499 n/a Py_XDECREF(errorHandler); 5500 n/a Py_XDECREF(exc); 5501 n/a done: 5502 n/a return v; 5503 n/a error: 5504 n/a Py_XDECREF(rep); 5505 n/a Py_XDECREF(errorHandler); 5506 n/a Py_XDECREF(exc); 5507 n/a Py_XDECREF(v); 5508 n/a return NULL; 5509 n/a } 5510 n/a 5511 n/a PyObject * 5512 n/a PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5513 n/a Py_ssize_t size, 5514 n/a const char *errors, 5515 n/a int byteorder) 5516 n/a { 5517 n/a PyObject *result; 5518 n/a PyObject *tmp = PyUnicode_FromWideChar(s, size); 5519 n/a if (tmp == NULL) 5520 n/a return NULL; 5521 n/a result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5522 n/a Py_DECREF(tmp); 5523 n/a return result; 5524 n/a } 5525 n/a 5526 n/a PyObject * 5527 n/a PyUnicode_AsUTF32String(PyObject *unicode) 5528 n/a { 5529 n/a return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5530 n/a } 5531 n/a 5532 n/a /* --- UTF-16 Codec ------------------------------------------------------- */ 5533 n/a 5534 n/a PyObject * 5535 n/a PyUnicode_DecodeUTF16(const char *s, 5536 n/a Py_ssize_t size, 5537 n/a const char *errors, 5538 n/a int *byteorder) 5539 n/a { 5540 n/a return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5541 n/a } 5542 n/a 5543 n/a PyObject * 5544 n/a PyUnicode_DecodeUTF16Stateful(const char *s, 5545 n/a Py_ssize_t size, 5546 n/a const char *errors, 5547 n/a int *byteorder, 5548 n/a Py_ssize_t *consumed) 5549 n/a { 5550 n/a const char *starts = s; 5551 n/a Py_ssize_t startinpos; 5552 n/a Py_ssize_t endinpos; 5553 n/a _PyUnicodeWriter writer; 5554 n/a const unsigned char *q, *e; 5555 n/a int bo = 0; /* assume native ordering by default */ 5556 n/a int native_ordering; 5557 n/a const char *errmsg = ""; 5558 n/a PyObject *errorHandler = NULL; 5559 n/a PyObject *exc = NULL; 5560 n/a const char *encoding; 5561 n/a 5562 n/a q = (unsigned char 
*)s; 5563 n/a e = q + size; 5564 n/a 5565 n/a if (byteorder) 5566 n/a bo = *byteorder; 5567 n/a 5568 n/a /* Check for BOM marks (U+FEFF) in the input and adjust current 5569 n/a byte order setting accordingly. In native mode, the leading BOM 5570 n/a mark is skipped, in all other modes, it is copied to the output 5571 n/a stream as-is (giving a ZWNBSP character). */ 5572 n/a if (bo == 0 && size >= 2) { 5573 n/a const Py_UCS4 bom = (q[1] << 8) | q[0]; 5574 n/a if (bom == 0xFEFF) { 5575 n/a q += 2; 5576 n/a bo = -1; 5577 n/a } 5578 n/a else if (bom == 0xFFFE) { 5579 n/a q += 2; 5580 n/a bo = 1; 5581 n/a } 5582 n/a if (byteorder) 5583 n/a *byteorder = bo; 5584 n/a } 5585 n/a 5586 n/a if (q == e) { 5587 n/a if (consumed) 5588 n/a *consumed = size; 5589 n/a _Py_RETURN_UNICODE_EMPTY(); 5590 n/a } 5591 n/a 5592 n/a #if PY_LITTLE_ENDIAN 5593 n/a native_ordering = bo <= 0; 5594 n/a encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5595 n/a #else 5596 n/a native_ordering = bo >= 0; 5597 n/a encoding = bo >= 0 ? 
"utf-16-be" : "utf-16-le"; 5598 n/a #endif 5599 n/a 5600 n/a /* Note: size will always be longer than the resulting Unicode 5601 n/a character count */ 5602 n/a _PyUnicodeWriter_Init(&writer); 5603 n/a writer.min_length = (e - q + 1) / 2; 5604 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5605 n/a goto onError; 5606 n/a 5607 n/a while (1) { 5608 n/a Py_UCS4 ch = 0; 5609 n/a if (e - q >= 2) { 5610 n/a int kind = writer.kind; 5611 n/a if (kind == PyUnicode_1BYTE_KIND) { 5612 n/a if (PyUnicode_IS_ASCII(writer.buffer)) 5613 n/a ch = asciilib_utf16_decode(&q, e, 5614 n/a (Py_UCS1*)writer.data, &writer.pos, 5615 n/a native_ordering); 5616 n/a else 5617 n/a ch = ucs1lib_utf16_decode(&q, e, 5618 n/a (Py_UCS1*)writer.data, &writer.pos, 5619 n/a native_ordering); 5620 n/a } else if (kind == PyUnicode_2BYTE_KIND) { 5621 n/a ch = ucs2lib_utf16_decode(&q, e, 5622 n/a (Py_UCS2*)writer.data, &writer.pos, 5623 n/a native_ordering); 5624 n/a } else { 5625 n/a assert(kind == PyUnicode_4BYTE_KIND); 5626 n/a ch = ucs4lib_utf16_decode(&q, e, 5627 n/a (Py_UCS4*)writer.data, &writer.pos, 5628 n/a native_ordering); 5629 n/a } 5630 n/a } 5631 n/a 5632 n/a switch (ch) 5633 n/a { 5634 n/a case 0: 5635 n/a /* remaining byte at the end? 
(size should be even) */ 5636 n/a if (q == e || consumed) 5637 n/a goto End; 5638 n/a errmsg = "truncated data"; 5639 n/a startinpos = ((const char *)q) - starts; 5640 n/a endinpos = ((const char *)e) - starts; 5641 n/a break; 5642 n/a /* The remaining input chars are ignored if the callback 5643 n/a chooses to skip the input */ 5644 n/a case 1: 5645 n/a q -= 2; 5646 n/a if (consumed) 5647 n/a goto End; 5648 n/a errmsg = "unexpected end of data"; 5649 n/a startinpos = ((const char *)q) - starts; 5650 n/a endinpos = ((const char *)e) - starts; 5651 n/a break; 5652 n/a case 2: 5653 n/a errmsg = "illegal encoding"; 5654 n/a startinpos = ((const char *)q) - 2 - starts; 5655 n/a endinpos = startinpos + 2; 5656 n/a break; 5657 n/a case 3: 5658 n/a errmsg = "illegal UTF-16 surrogate"; 5659 n/a startinpos = ((const char *)q) - 4 - starts; 5660 n/a endinpos = startinpos + 2; 5661 n/a break; 5662 n/a default: 5663 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5664 n/a goto onError; 5665 n/a continue; 5666 n/a } 5667 n/a 5668 n/a if (unicode_decode_call_errorhandler_writer( 5669 n/a errors, 5670 n/a &errorHandler, 5671 n/a encoding, errmsg, 5672 n/a &starts, 5673 n/a (const char **)&e, 5674 n/a &startinpos, 5675 n/a &endinpos, 5676 n/a &exc, 5677 n/a (const char **)&q, 5678 n/a &writer)) 5679 n/a goto onError; 5680 n/a } 5681 n/a 5682 n/a End: 5683 n/a if (consumed) 5684 n/a *consumed = (const char *)q-starts; 5685 n/a 5686 n/a Py_XDECREF(errorHandler); 5687 n/a Py_XDECREF(exc); 5688 n/a return _PyUnicodeWriter_Finish(&writer); 5689 n/a 5690 n/a onError: 5691 n/a _PyUnicodeWriter_Dealloc(&writer); 5692 n/a Py_XDECREF(errorHandler); 5693 n/a Py_XDECREF(exc); 5694 n/a return NULL; 5695 n/a } 5696 n/a 5697 n/a PyObject * 5698 n/a _PyUnicode_EncodeUTF16(PyObject *str, 5699 n/a const char *errors, 5700 n/a int byteorder) 5701 n/a { 5702 n/a enum PyUnicode_Kind kind; 5703 n/a const void *data; 5704 n/a Py_ssize_t len; 5705 n/a PyObject *v; 5706 n/a unsigned short *out; 
5707 n/a Py_ssize_t pairs; 5708 n/a #if PY_BIG_ENDIAN 5709 n/a int native_ordering = byteorder >= 0; 5710 n/a #else 5711 n/a int native_ordering = byteorder <= 0; 5712 n/a #endif 5713 n/a const char *encoding; 5714 n/a Py_ssize_t nsize, pos; 5715 n/a PyObject *errorHandler = NULL; 5716 n/a PyObject *exc = NULL; 5717 n/a PyObject *rep = NULL; 5718 n/a 5719 n/a if (!PyUnicode_Check(str)) { 5720 n/a PyErr_BadArgument(); 5721 n/a return NULL; 5722 n/a } 5723 n/a if (PyUnicode_READY(str) == -1) 5724 n/a return NULL; 5725 n/a kind = PyUnicode_KIND(str); 5726 n/a data = PyUnicode_DATA(str); 5727 n/a len = PyUnicode_GET_LENGTH(str); 5728 n/a 5729 n/a pairs = 0; 5730 n/a if (kind == PyUnicode_4BYTE_KIND) { 5731 n/a const Py_UCS4 *in = (const Py_UCS4 *)data; 5732 n/a const Py_UCS4 *end = in + len; 5733 n/a while (in < end) { 5734 n/a if (*in++ >= 0x10000) { 5735 n/a pairs++; 5736 n/a } 5737 n/a } 5738 n/a } 5739 n/a if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) { 5740 n/a return PyErr_NoMemory(); 5741 n/a } 5742 n/a nsize = len + pairs + (byteorder == 0); 5743 n/a v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5744 n/a if (v == NULL) { 5745 n/a return NULL; 5746 n/a } 5747 n/a 5748 n/a /* output buffer is 2-bytes aligned */ 5749 n/a assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5750 n/a out = (unsigned short *)PyBytes_AS_STRING(v); 5751 n/a if (byteorder == 0) { 5752 n/a *out++ = 0xFEFF; 5753 n/a } 5754 n/a if (len == 0) { 5755 n/a goto done; 5756 n/a } 5757 n/a 5758 n/a if (kind == PyUnicode_1BYTE_KIND) { 5759 n/a ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5760 n/a goto done; 5761 n/a } 5762 n/a 5763 n/a if (byteorder < 0) { 5764 n/a encoding = "utf-16-le"; 5765 n/a } 5766 n/a else if (byteorder > 0) { 5767 n/a encoding = "utf-16-be"; 5768 n/a } 5769 n/a else { 5770 n/a encoding = "utf-16"; 5771 n/a } 5772 n/a 5773 n/a pos = 0; 5774 n/a while (pos < len) { 5775 n/a Py_ssize_t repsize, moreunits; 5776 n/a 5777 n/a if (kind == 
PyUnicode_2BYTE_KIND) { 5778 n/a pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5779 n/a &out, native_ordering); 5780 n/a } 5781 n/a else { 5782 n/a assert(kind == PyUnicode_4BYTE_KIND); 5783 n/a pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5784 n/a &out, native_ordering); 5785 n/a } 5786 n/a if (pos == len) 5787 n/a break; 5788 n/a 5789 n/a rep = unicode_encode_call_errorhandler( 5790 n/a errors, &errorHandler, 5791 n/a encoding, "surrogates not allowed", 5792 n/a str, &exc, pos, pos + 1, &pos); 5793 n/a if (!rep) 5794 n/a goto error; 5795 n/a 5796 n/a if (PyBytes_Check(rep)) { 5797 n/a repsize = PyBytes_GET_SIZE(rep); 5798 n/a if (repsize & 1) { 5799 n/a raise_encode_exception(&exc, encoding, 5800 n/a str, pos - 1, pos, 5801 n/a "surrogates not allowed"); 5802 n/a goto error; 5803 n/a } 5804 n/a moreunits = repsize / 2; 5805 n/a } 5806 n/a else { 5807 n/a assert(PyUnicode_Check(rep)); 5808 n/a if (PyUnicode_READY(rep) < 0) 5809 n/a goto error; 5810 n/a moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5811 n/a if (!PyUnicode_IS_ASCII(rep)) { 5812 n/a raise_encode_exception(&exc, encoding, 5813 n/a str, pos - 1, pos, 5814 n/a "surrogates not allowed"); 5815 n/a goto error; 5816 n/a } 5817 n/a } 5818 n/a 5819 n/a /* two bytes are reserved for each surrogate */ 5820 n/a if (moreunits > 1) { 5821 n/a Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5822 n/a Py_ssize_t morebytes = 2 * (moreunits - 1); 5823 n/a if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5824 n/a /* integer overflow */ 5825 n/a PyErr_NoMemory(); 5826 n/a goto error; 5827 n/a } 5828 n/a if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5829 n/a goto error; 5830 n/a out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5831 n/a } 5832 n/a 5833 n/a if (PyBytes_Check(rep)) { 5834 n/a memcpy(out, PyBytes_AS_STRING(rep), repsize); 5835 n/a out += moreunits; 5836 n/a } else /* rep is unicode */ { 5837 n/a 
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5838 n/a ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5839 n/a &out, native_ordering); 5840 n/a } 5841 n/a 5842 n/a Py_CLEAR(rep); 5843 n/a } 5844 n/a 5845 n/a /* Cut back to size actually needed. This is necessary for, for example, 5846 n/a encoding of a string containing isolated surrogates and the 'ignore' handler 5847 n/a is used. */ 5848 n/a nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5849 n/a if (nsize != PyBytes_GET_SIZE(v)) 5850 n/a _PyBytes_Resize(&v, nsize); 5851 n/a Py_XDECREF(errorHandler); 5852 n/a Py_XDECREF(exc); 5853 n/a done: 5854 n/a return v; 5855 n/a error: 5856 n/a Py_XDECREF(rep); 5857 n/a Py_XDECREF(errorHandler); 5858 n/a Py_XDECREF(exc); 5859 n/a Py_XDECREF(v); 5860 n/a return NULL; 5861 n/a #undef STORECHAR 5862 n/a } 5863 n/a 5864 n/a PyObject * 5865 n/a PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5866 n/a Py_ssize_t size, 5867 n/a const char *errors, 5868 n/a int byteorder) 5869 n/a { 5870 n/a PyObject *result; 5871 n/a PyObject *tmp = PyUnicode_FromWideChar(s, size); 5872 n/a if (tmp == NULL) 5873 n/a return NULL; 5874 n/a result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5875 n/a Py_DECREF(tmp); 5876 n/a return result; 5877 n/a } 5878 n/a 5879 n/a PyObject * 5880 n/a PyUnicode_AsUTF16String(PyObject *unicode) 5881 n/a { 5882 n/a return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5883 n/a } 5884 n/a 5885 n/a /* --- Unicode Escape Codec ----------------------------------------------- */ 5886 n/a 5887 n/a static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5888 n/a 5889 n/a PyObject * 5890 n/a _PyUnicode_DecodeUnicodeEscape(const char *s, 5891 n/a Py_ssize_t size, 5892 n/a const char *errors, 5893 n/a const char **first_invalid_escape) 5894 n/a { 5895 n/a const char *starts = s; 5896 n/a _PyUnicodeWriter writer; 5897 n/a const char *end; 5898 n/a PyObject *errorHandler = NULL; 5899 n/a PyObject *exc = NULL; 5900 n/a 5901 n/a // so we can remember 
if we've seen an invalid escape char or not 5902 n/a *first_invalid_escape = NULL; 5903 n/a 5904 n/a if (size == 0) { 5905 n/a _Py_RETURN_UNICODE_EMPTY(); 5906 n/a } 5907 n/a /* Escaped strings will always be longer than the resulting 5908 n/a Unicode string, so we start with size here and then reduce the 5909 n/a length after conversion to the true value. 5910 n/a (but if the error callback returns a long replacement string 5911 n/a we'll have to allocate more space) */ 5912 n/a _PyUnicodeWriter_Init(&writer); 5913 n/a writer.min_length = size; 5914 n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 5915 n/a goto onError; 5916 n/a } 5917 n/a 5918 n/a end = s + size; 5919 n/a while (s < end) { 5920 n/a unsigned char c = (unsigned char) *s++; 5921 n/a Py_UCS4 ch; 5922 n/a int count; 5923 n/a Py_ssize_t startinpos; 5924 n/a Py_ssize_t endinpos; 5925 n/a const char *message; 5926 n/a 5927 n/a #define WRITE_ASCII_CHAR(ch) \ 5928 n/a do { \ 5929 n/a assert(ch <= 127); \ 5930 n/a assert(writer.pos < writer.size); \ 5931 n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 5932 n/a } while(0) 5933 n/a 5934 n/a #define WRITE_CHAR(ch) \ 5935 n/a do { \ 5936 n/a if (ch <= writer.maxchar) { \ 5937 n/a assert(writer.pos < writer.size); \ 5938 n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 5939 n/a } \ 5940 n/a else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 5941 n/a goto onError; \ 5942 n/a } \ 5943 n/a } while(0) 5944 n/a 5945 n/a /* Non-escape characters are interpreted as Unicode ordinals */ 5946 n/a if (c != '\\') { 5947 n/a WRITE_CHAR(c); 5948 n/a continue; 5949 n/a } 5950 n/a 5951 n/a startinpos = s - starts - 1; 5952 n/a /* \ - Escapes */ 5953 n/a if (s >= end) { 5954 n/a message = "\\ at end of string"; 5955 n/a goto error; 5956 n/a } 5957 n/a c = (unsigned char) *s++; 5958 n/a 5959 n/a assert(writer.pos < writer.size); 5960 n/a switch (c) { 5961 n/a 5962 n/a /* \x escapes */ 5963 n/a case '\n': continue; 5964 
n/a case '\\': WRITE_ASCII_CHAR('\\'); continue; 5965 n/a case '\'': WRITE_ASCII_CHAR('\''); continue; 5966 n/a case '\"': WRITE_ASCII_CHAR('\"'); continue; 5967 n/a case 'b': WRITE_ASCII_CHAR('\b'); continue; 5968 n/a /* FF */ 5969 n/a case 'f': WRITE_ASCII_CHAR('\014'); continue; 5970 n/a case 't': WRITE_ASCII_CHAR('\t'); continue; 5971 n/a case 'n': WRITE_ASCII_CHAR('\n'); continue; 5972 n/a case 'r': WRITE_ASCII_CHAR('\r'); continue; 5973 n/a /* VT */ 5974 n/a case 'v': WRITE_ASCII_CHAR('\013'); continue; 5975 n/a /* BEL, not classic C */ 5976 n/a case 'a': WRITE_ASCII_CHAR('\007'); continue; 5977 n/a 5978 n/a /* \OOO (octal) escapes */ 5979 n/a case '0': case '1': case '2': case '3': 5980 n/a case '4': case '5': case '6': case '7': 5981 n/a ch = c - '0'; 5982 n/a if (s < end && '0' <= *s && *s <= '7') { 5983 n/a ch = (ch<<3) + *s++ - '0'; 5984 n/a if (s < end && '0' <= *s && *s <= '7') { 5985 n/a ch = (ch<<3) + *s++ - '0'; 5986 n/a } 5987 n/a } 5988 n/a WRITE_CHAR(ch); 5989 n/a continue; 5990 n/a 5991 n/a /* hex escapes */ 5992 n/a /* \xXX */ 5993 n/a case 'x': 5994 n/a count = 2; 5995 n/a message = "truncated \\xXX escape"; 5996 n/a goto hexescape; 5997 n/a 5998 n/a /* \uXXXX */ 5999 n/a case 'u': 6000 n/a count = 4; 6001 n/a message = "truncated \\uXXXX escape"; 6002 n/a goto hexescape; 6003 n/a 6004 n/a /* \UXXXXXXXX */ 6005 n/a case 'U': 6006 n/a count = 8; 6007 n/a message = "truncated \\UXXXXXXXX escape"; 6008 n/a hexescape: 6009 n/a for (ch = 0; count && s < end; ++s, --count) { 6010 n/a c = (unsigned char)*s; 6011 n/a ch <<= 4; 6012 n/a if (c >= '0' && c <= '9') { 6013 n/a ch += c - '0'; 6014 n/a } 6015 n/a else if (c >= 'a' && c <= 'f') { 6016 n/a ch += c - ('a' - 10); 6017 n/a } 6018 n/a else if (c >= 'A' && c <= 'F') { 6019 n/a ch += c - ('A' - 10); 6020 n/a } 6021 n/a else { 6022 n/a break; 6023 n/a } 6024 n/a } 6025 n/a if (count) { 6026 n/a goto error; 6027 n/a } 6028 n/a 6029 n/a /* when we get here, ch is a 32-bit unicode character */ 6030 n/a 
if (ch > MAX_UNICODE) { 6031 n/a message = "illegal Unicode character"; 6032 n/a goto error; 6033 n/a } 6034 n/a 6035 n/a WRITE_CHAR(ch); 6036 n/a continue; 6037 n/a 6038 n/a /* \N{name} */ 6039 n/a case 'N': 6040 n/a if (ucnhash_CAPI == NULL) { 6041 n/a /* load the unicode data module */ 6042 n/a ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 6043 n/a PyUnicodeData_CAPSULE_NAME, 1); 6044 n/a if (ucnhash_CAPI == NULL) { 6045 n/a PyErr_SetString( 6046 n/a PyExc_UnicodeError, 6047 n/a "\\N escapes not supported (can't load unicodedata module)" 6048 n/a ); 6049 n/a goto onError; 6050 n/a } 6051 n/a } 6052 n/a 6053 n/a message = "malformed \\N character escape"; 6054 n/a if (*s == '{') { 6055 n/a const char *start = ++s; 6056 n/a size_t namelen; 6057 n/a /* look for the closing brace */ 6058 n/a while (s < end && *s != '}') 6059 n/a s++; 6060 n/a namelen = s - start; 6061 n/a if (namelen && s < end) { 6062 n/a /* found a name. look it up in the unicode database */ 6063 n/a s++; 6064 n/a ch = 0xffffffff; /* in case 'getcode' messes up */ 6065 n/a if (namelen <= INT_MAX && 6066 n/a ucnhash_CAPI->getcode(NULL, start, (int)namelen, 6067 n/a &ch, 0)) { 6068 n/a assert(ch <= MAX_UNICODE); 6069 n/a WRITE_CHAR(ch); 6070 n/a continue; 6071 n/a } 6072 n/a message = "unknown Unicode character name"; 6073 n/a } 6074 n/a } 6075 n/a goto error; 6076 n/a 6077 n/a default: 6078 n/a if (*first_invalid_escape == NULL) { 6079 n/a *first_invalid_escape = s-1; /* Back up one char, since we've 6080 n/a already incremented s. 
*/ 6081 n/a } 6082 n/a WRITE_ASCII_CHAR('\\'); 6083 n/a WRITE_CHAR(c); 6084 n/a continue; 6085 n/a } 6086 n/a 6087 n/a error: 6088 n/a endinpos = s-starts; 6089 n/a writer.min_length = end - s + writer.pos; 6090 n/a if (unicode_decode_call_errorhandler_writer( 6091 n/a errors, &errorHandler, 6092 n/a "unicodeescape", message, 6093 n/a &starts, &end, &startinpos, &endinpos, &exc, &s, 6094 n/a &writer)) { 6095 n/a goto onError; 6096 n/a } 6097 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) { 6098 n/a goto onError; 6099 n/a } 6100 n/a 6101 n/a #undef WRITE_ASCII_CHAR 6102 n/a #undef WRITE_CHAR 6103 n/a } 6104 n/a 6105 n/a Py_XDECREF(errorHandler); 6106 n/a Py_XDECREF(exc); 6107 n/a return _PyUnicodeWriter_Finish(&writer); 6108 n/a 6109 n/a onError: 6110 n/a _PyUnicodeWriter_Dealloc(&writer); 6111 n/a Py_XDECREF(errorHandler); 6112 n/a Py_XDECREF(exc); 6113 n/a return NULL; 6114 n/a } 6115 n/a 6116 n/a PyObject * 6117 n/a PyUnicode_DecodeUnicodeEscape(const char *s, 6118 n/a Py_ssize_t size, 6119 n/a const char *errors) 6120 n/a { 6121 n/a const char *first_invalid_escape; 6122 n/a PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, 6123 n/a &first_invalid_escape); 6124 n/a if (result == NULL) 6125 n/a return NULL; 6126 n/a if (first_invalid_escape != NULL) { 6127 n/a if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 6128 n/a "invalid escape sequence '\\%c'", 6129 n/a *first_invalid_escape) < 0) { 6130 n/a Py_DECREF(result); 6131 n/a return NULL; 6132 n/a } 6133 n/a } 6134 n/a return result; 6135 n/a } 6136 n/a 6137 n/a /* Return a Unicode-Escape string version of the Unicode object. 
*/ 6138 n/a 6139 n/a PyObject * 6140 n/a PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 6141 n/a { 6142 n/a Py_ssize_t i, len; 6143 n/a PyObject *repr; 6144 n/a char *p; 6145 n/a enum PyUnicode_Kind kind; 6146 n/a void *data; 6147 n/a Py_ssize_t expandsize; 6148 n/a 6149 n/a /* Initial allocation is based on the longest-possible character 6150 n/a escape. 6151 n/a 6152 n/a For UCS1 strings it's '\xxx', 4 bytes per source character. 6153 n/a For UCS2 strings it's '\uxxxx', 6 bytes per source character. 6154 n/a For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 6155 n/a */ 6156 n/a 6157 n/a if (!PyUnicode_Check(unicode)) { 6158 n/a PyErr_BadArgument(); 6159 n/a return NULL; 6160 n/a } 6161 n/a if (PyUnicode_READY(unicode) == -1) { 6162 n/a return NULL; 6163 n/a } 6164 n/a 6165 n/a len = PyUnicode_GET_LENGTH(unicode); 6166 n/a if (len == 0) { 6167 n/a return PyBytes_FromStringAndSize(NULL, 0); 6168 n/a } 6169 n/a 6170 n/a kind = PyUnicode_KIND(unicode); 6171 n/a data = PyUnicode_DATA(unicode); 6172 n/a /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6173 n/a bytes, and 1 byte characters 4. 
*/ 6174 n/a expandsize = kind * 2 + 2; 6175 n/a if (len > PY_SSIZE_T_MAX / expandsize) { 6176 n/a return PyErr_NoMemory(); 6177 n/a } 6178 n/a repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6179 n/a if (repr == NULL) { 6180 n/a return NULL; 6181 n/a } 6182 n/a 6183 n/a p = PyBytes_AS_STRING(repr); 6184 n/a for (i = 0; i < len; i++) { 6185 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6186 n/a 6187 n/a /* U+0000-U+00ff range */ 6188 n/a if (ch < 0x100) { 6189 n/a if (ch >= ' ' && ch < 127) { 6190 n/a if (ch != '\\') { 6191 n/a /* Copy printable US ASCII as-is */ 6192 n/a *p++ = (char) ch; 6193 n/a } 6194 n/a /* Escape backslashes */ 6195 n/a else { 6196 n/a *p++ = '\\'; 6197 n/a *p++ = '\\'; 6198 n/a } 6199 n/a } 6200 n/a 6201 n/a /* Map special whitespace to '\t', \n', '\r' */ 6202 n/a else if (ch == '\t') { 6203 n/a *p++ = '\\'; 6204 n/a *p++ = 't'; 6205 n/a } 6206 n/a else if (ch == '\n') { 6207 n/a *p++ = '\\'; 6208 n/a *p++ = 'n'; 6209 n/a } 6210 n/a else if (ch == '\r') { 6211 n/a *p++ = '\\'; 6212 n/a *p++ = 'r'; 6213 n/a } 6214 n/a 6215 n/a /* Map non-printable US ASCII and 8-bit characters to '\xHH' */ 6216 n/a else { 6217 n/a *p++ = '\\'; 6218 n/a *p++ = 'x'; 6219 n/a *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6220 n/a *p++ = Py_hexdigits[ch & 0x000F]; 6221 n/a } 6222 n/a } 6223 n/a /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */ 6224 n/a else if (ch < 0x10000) { 6225 n/a *p++ = '\\'; 6226 n/a *p++ = 'u'; 6227 n/a *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6228 n/a *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6229 n/a *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6230 n/a *p++ = Py_hexdigits[ch & 0x000F]; 6231 n/a } 6232 n/a /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */ 6233 n/a else { 6234 n/a 6235 n/a /* Make sure that the first two digits are zero */ 6236 n/a assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); 6237 n/a *p++ = '\\'; 6238 n/a *p++ = 'U'; 6239 n/a *p++ = '0'; 6240 n/a *p++ = '0'; 6241 n/a *p++ = 
Py_hexdigits[(ch >> 20) & 0x0000000F]; 6242 n/a *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 6243 n/a *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 6244 n/a *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 6245 n/a *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 6246 n/a *p++ = Py_hexdigits[ch & 0x0000000F]; 6247 n/a } 6248 n/a } 6249 n/a 6250 n/a assert(p - PyBytes_AS_STRING(repr) > 0); 6251 n/a if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { 6252 n/a return NULL; 6253 n/a } 6254 n/a return repr; 6255 n/a } 6256 n/a 6257 n/a PyObject * 6258 n/a PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6259 n/a Py_ssize_t size) 6260 n/a { 6261 n/a PyObject *result; 6262 n/a PyObject *tmp = PyUnicode_FromWideChar(s, size); 6263 n/a if (tmp == NULL) { 6264 n/a return NULL; 6265 n/a } 6266 n/a 6267 n/a result = PyUnicode_AsUnicodeEscapeString(tmp); 6268 n/a Py_DECREF(tmp); 6269 n/a return result; 6270 n/a } 6271 n/a 6272 n/a /* --- Raw Unicode Escape Codec ------------------------------------------- */ 6273 n/a 6274 n/a PyObject * 6275 n/a PyUnicode_DecodeRawUnicodeEscape(const char *s, 6276 n/a Py_ssize_t size, 6277 n/a const char *errors) 6278 n/a { 6279 n/a const char *starts = s; 6280 n/a _PyUnicodeWriter writer; 6281 n/a const char *end; 6282 n/a PyObject *errorHandler = NULL; 6283 n/a PyObject *exc = NULL; 6284 n/a 6285 n/a if (size == 0) { 6286 n/a _Py_RETURN_UNICODE_EMPTY(); 6287 n/a } 6288 n/a 6289 n/a /* Escaped strings will always be longer than the resulting 6290 n/a Unicode string, so we start with size here and then reduce the 6291 n/a length after conversion to the true value. 
(But decoding error 6292 n/a handler might have to resize the string) */ 6293 n/a _PyUnicodeWriter_Init(&writer); 6294 n/a writer.min_length = size; 6295 n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 6296 n/a goto onError; 6297 n/a } 6298 n/a 6299 n/a end = s + size; 6300 n/a while (s < end) { 6301 n/a unsigned char c = (unsigned char) *s++; 6302 n/a Py_UCS4 ch; 6303 n/a int count; 6304 n/a Py_ssize_t startinpos; 6305 n/a Py_ssize_t endinpos; 6306 n/a const char *message; 6307 n/a 6308 n/a #define WRITE_CHAR(ch) \ 6309 n/a do { \ 6310 n/a if (ch <= writer.maxchar) { \ 6311 n/a assert(writer.pos < writer.size); \ 6312 n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 6313 n/a } \ 6314 n/a else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 6315 n/a goto onError; \ 6316 n/a } \ 6317 n/a } while(0) 6318 n/a 6319 n/a /* Non-escape characters are interpreted as Unicode ordinals */ 6320 n/a if (c != '\\' || s >= end) { 6321 n/a WRITE_CHAR(c); 6322 n/a continue; 6323 n/a } 6324 n/a 6325 n/a c = (unsigned char) *s++; 6326 n/a if (c == 'u') { 6327 n/a count = 4; 6328 n/a message = "truncated \\uXXXX escape"; 6329 n/a } 6330 n/a else if (c == 'U') { 6331 n/a count = 8; 6332 n/a message = "truncated \\UXXXXXXXX escape"; 6333 n/a } 6334 n/a else { 6335 n/a assert(writer.pos < writer.size); 6336 n/a PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\'); 6337 n/a WRITE_CHAR(c); 6338 n/a continue; 6339 n/a } 6340 n/a startinpos = s - starts - 2; 6341 n/a 6342 n/a /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ 6343 n/a for (ch = 0; count && s < end; ++s, --count) { 6344 n/a c = (unsigned char)*s; 6345 n/a ch <<= 4; 6346 n/a if (c >= '0' && c <= '9') { 6347 n/a ch += c - '0'; 6348 n/a } 6349 n/a else if (c >= 'a' && c <= 'f') { 6350 n/a ch += c - ('a' - 10); 6351 n/a } 6352 n/a else if (c >= 'A' && c <= 'F') { 6353 n/a ch += c - ('A' - 10); 6354 n/a } 6355 n/a else { 6356 n/a break; 6357 n/a } 6358 n/a } 6359 n/a if (!count) { 
6360 n/a if (ch <= MAX_UNICODE) { 6361 n/a WRITE_CHAR(ch); 6362 n/a continue; 6363 n/a } 6364 n/a message = "\\Uxxxxxxxx out of range"; 6365 n/a } 6366 n/a 6367 n/a endinpos = s-starts; 6368 n/a writer.min_length = end - s + writer.pos; 6369 n/a if (unicode_decode_call_errorhandler_writer( 6370 n/a errors, &errorHandler, 6371 n/a "rawunicodeescape", message, 6372 n/a &starts, &end, &startinpos, &endinpos, &exc, &s, 6373 n/a &writer)) { 6374 n/a goto onError; 6375 n/a } 6376 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) { 6377 n/a goto onError; 6378 n/a } 6379 n/a 6380 n/a #undef WRITE_CHAR 6381 n/a } 6382 n/a Py_XDECREF(errorHandler); 6383 n/a Py_XDECREF(exc); 6384 n/a return _PyUnicodeWriter_Finish(&writer); 6385 n/a 6386 n/a onError: 6387 n/a _PyUnicodeWriter_Dealloc(&writer); 6388 n/a Py_XDECREF(errorHandler); 6389 n/a Py_XDECREF(exc); 6390 n/a return NULL; 6391 n/a 6392 n/a } 6393 n/a 6394 n/a 6395 n/a PyObject * 6396 n/a PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6397 n/a { 6398 n/a PyObject *repr; 6399 n/a char *p; 6400 n/a Py_ssize_t expandsize, pos; 6401 n/a int kind; 6402 n/a void *data; 6403 n/a Py_ssize_t len; 6404 n/a 6405 n/a if (!PyUnicode_Check(unicode)) { 6406 n/a PyErr_BadArgument(); 6407 n/a return NULL; 6408 n/a } 6409 n/a if (PyUnicode_READY(unicode) == -1) { 6410 n/a return NULL; 6411 n/a } 6412 n/a kind = PyUnicode_KIND(unicode); 6413 n/a data = PyUnicode_DATA(unicode); 6414 n/a len = PyUnicode_GET_LENGTH(unicode); 6415 n/a if (kind == PyUnicode_1BYTE_KIND) { 6416 n/a return PyBytes_FromStringAndSize(data, len); 6417 n/a } 6418 n/a 6419 n/a /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6420 n/a bytes, and 1 byte characters 4. 
*/ 6421 n/a expandsize = kind * 2 + 2; 6422 n/a /* guard the expandsize * len multiplication below against overflow */ 6423 n/a if (len > PY_SSIZE_T_MAX / expandsize) { 6424 n/a return PyErr_NoMemory(); 6425 n/a } 6426 n/a repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6427 n/a if (repr == NULL) { 6428 n/a return NULL; 6429 n/a } 6430 n/a if (len == 0) { /* empty input: the zero-length bytes object is already the result */ 6431 n/a return repr; 6432 n/a } 6433 n/a 6434 n/a p = PyBytes_AS_STRING(repr); 6435 n/a for (pos = 0; pos < len; pos++) { 6436 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6437 n/a 6438 n/a /* U+0000-U+00ff range: Copy 8-bit characters as-is */ 6439 n/a if (ch < 0x100) { 6440 n/a *p++ = (char) ch; 6441 n/a } 6442 n/a /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */ 6443 n/a else if (ch < 0x10000) { 6444 n/a *p++ = '\\'; 6445 n/a *p++ = 'u'; 6446 n/a *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6447 n/a *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6448 n/a *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6449 n/a *p++ = Py_hexdigits[ch & 15]; 6450 n/a } 6451 n/a /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */ 6452 n/a else { 6453 n/a assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); 6454 n/a *p++ = '\\'; 6455 n/a *p++ = 'U'; 6456 n/a *p++ = '0'; 6457 n/a *p++ = '0'; 6458 n/a *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6459 n/a *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6460 n/a *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6461 n/a *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6462 n/a *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6463 n/a *p++ = Py_hexdigits[ch & 15]; 6464 n/a } 6465 n/a } 6466 n/a /* shrink the worst-case allocation to the number of bytes actually written */ 6467 n/a assert(p > PyBytes_AS_STRING(repr)); 6468 n/a if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { 6469 n/a return NULL; 6470 n/a } 6471 n/a return repr; 6472 n/a } 6473 n/a 6474 n/a PyObject * 6475 n/a PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6476 n/a Py_ssize_t size) 6477 n/a { 6478 n/a PyObject *result; 6479 n/a PyObject *tmp = PyUnicode_FromWideChar(s, size); 6480 n/a if (tmp == NULL) 6481 n/a return NULL; 6482 n/a result = 
PyUnicode_AsRawUnicodeEscapeString(tmp); 6483 n/a Py_DECREF(tmp); 6484 n/a return result; 6485 n/a } 6486 n/a 6487 n/a /* --- Unicode Internal Codec ------------------------------------------- */ 6488 n/a 6489 n/a PyObject * 6490 n/a _PyUnicode_DecodeUnicodeInternal(const char *s, 6491 n/a Py_ssize_t size, 6492 n/a const char *errors) 6493 n/a { 6494 n/a const char *starts = s; 6495 n/a Py_ssize_t startinpos; 6496 n/a Py_ssize_t endinpos; 6497 n/a _PyUnicodeWriter writer; 6498 n/a const char *end; 6499 n/a const char *reason; 6500 n/a PyObject *errorHandler = NULL; 6501 n/a PyObject *exc = NULL; 6502 n/a 6503 n/a if (PyErr_WarnEx(PyExc_DeprecationWarning, 6504 n/a "unicode_internal codec has been deprecated", 6505 n/a 1)) 6506 n/a return NULL; 6507 n/a 6508 n/a if (size == 0) 6509 n/a _Py_RETURN_UNICODE_EMPTY(); 6510 n/a 6511 n/a _PyUnicodeWriter_Init(&writer); 6512 n/a if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6513 n/a PyErr_NoMemory(); 6514 n/a goto onError; 6515 n/a } 6516 n/a writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6517 n/a 6518 n/a end = s + size; 6519 n/a while (s < end) { 6520 n/a Py_UNICODE uch; 6521 n/a Py_UCS4 ch; 6522 n/a if (end - s < Py_UNICODE_SIZE) { 6523 n/a endinpos = end-starts; 6524 n/a reason = "truncated input"; 6525 n/a goto error; 6526 n/a } 6527 n/a /* We copy the raw representation one byte at a time because the 6528 n/a pointer may be unaligned (see test_codeccallbacks). */ 6529 n/a ((char *) &uch)[0] = s[0]; 6530 n/a ((char *) &uch)[1] = s[1]; 6531 n/a #ifdef Py_UNICODE_WIDE 6532 n/a ((char *) &uch)[2] = s[2]; 6533 n/a ((char *) &uch)[3] = s[3]; 6534 n/a #endif 6535 n/a ch = uch; 6536 n/a #ifdef Py_UNICODE_WIDE 6537 n/a /* We have to sanity check the raw data, otherwise doom looms for 6538 n/a some malformed UCS-4 data. 
*/ 6539 n/a if (ch > 0x10ffff) { 6540 n/a endinpos = s - starts + Py_UNICODE_SIZE; 6541 n/a reason = "illegal code point (> 0x10FFFF)"; 6542 n/a goto error; 6543 n/a } 6544 n/a #endif 6545 n/a s += Py_UNICODE_SIZE; 6546 n/a #ifndef Py_UNICODE_WIDE 6547 n/a if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6548 n/a { 6549 n/a Py_UNICODE uch2; 6550 n/a ((char *) &uch2)[0] = s[0]; 6551 n/a ((char *) &uch2)[1] = s[1]; 6552 n/a if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6553 n/a { 6554 n/a ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6555 n/a s += Py_UNICODE_SIZE; 6556 n/a } 6557 n/a } 6558 n/a #endif 6559 n/a 6560 n/a if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6561 n/a goto onError; 6562 n/a continue; 6563 n/a 6564 n/a error: 6565 n/a startinpos = s - starts; 6566 n/a if (unicode_decode_call_errorhandler_writer( 6567 n/a errors, &errorHandler, 6568 n/a "unicode_internal", reason, 6569 n/a &starts, &end, &startinpos, &endinpos, &exc, &s, 6570 n/a &writer)) 6571 n/a goto onError; 6572 n/a } 6573 n/a 6574 n/a Py_XDECREF(errorHandler); 6575 n/a Py_XDECREF(exc); 6576 n/a return _PyUnicodeWriter_Finish(&writer); 6577 n/a 6578 n/a onError: 6579 n/a _PyUnicodeWriter_Dealloc(&writer); 6580 n/a Py_XDECREF(errorHandler); 6581 n/a Py_XDECREF(exc); 6582 n/a return NULL; 6583 n/a } 6584 n/a 6585 n/a /* --- Latin-1 Codec ------------------------------------------------------ */ 6586 n/a 6587 n/a PyObject * 6588 n/a PyUnicode_DecodeLatin1(const char *s, 6589 n/a Py_ssize_t size, 6590 n/a const char *errors) 6591 n/a { 6592 n/a /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6593 n/a return _PyUnicode_FromUCS1((unsigned char*)s, size); 6594 n/a } 6595 n/a 6596 n/a /* create or adjust a UnicodeEncodeError */ 6597 n/a static void 6598 n/a make_encode_exception(PyObject **exceptionObject, 6599 n/a const char *encoding, 6600 n/a PyObject *unicode, 6601 n/a Py_ssize_t startpos, Py_ssize_t endpos, 6602 n/a const char *reason) 6603 n/a { 6604 n/a if (*exceptionObject == NULL) { 6605 n/a *exceptionObject = PyObject_CallFunction( 6606 n/a PyExc_UnicodeEncodeError, "sOnns", 6607 n/a encoding, unicode, startpos, endpos, reason); 6608 n/a } 6609 n/a else { 6610 n/a if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6611 n/a goto onError; 6612 n/a if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6613 n/a goto onError; 6614 n/a if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6615 n/a goto onError; 6616 n/a return; 6617 n/a onError: 6618 n/a Py_CLEAR(*exceptionObject); 6619 n/a } 6620 n/a } 6621 n/a 6622 n/a /* raises a UnicodeEncodeError */ 6623 n/a static void 6624 n/a raise_encode_exception(PyObject **exceptionObject, 6625 n/a const char *encoding, 6626 n/a PyObject *unicode, 6627 n/a Py_ssize_t startpos, Py_ssize_t endpos, 6628 n/a const char *reason) 6629 n/a { 6630 n/a make_encode_exception(exceptionObject, 6631 n/a encoding, unicode, startpos, endpos, reason); 6632 n/a if (*exceptionObject != NULL) 6633 n/a PyCodec_StrictErrors(*exceptionObject); 6634 n/a } 6635 n/a 6636 n/a /* error handling callback helper: 6637 n/a build arguments, call the callback and check the arguments, 6638 n/a put the result into newpos and return the replacement string, which 6639 n/a has to be freed by the caller */ 6640 n/a static PyObject * 6641 n/a unicode_encode_call_errorhandler(const char *errors, 6642 n/a PyObject **errorHandler, 6643 n/a const char *encoding, const char *reason, 6644 n/a PyObject *unicode, PyObject **exceptionObject, 6645 n/a Py_ssize_t startpos, Py_ssize_t endpos, 6646 n/a Py_ssize_t *newpos) 6647 n/a 
{ 6648 n/a static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6649 n/a Py_ssize_t len; 6650 n/a PyObject *restuple; 6651 n/a PyObject *resunicode; 6652 n/a 6653 n/a if (*errorHandler == NULL) { 6654 n/a *errorHandler = PyCodec_LookupError(errors); 6655 n/a if (*errorHandler == NULL) 6656 n/a return NULL; 6657 n/a } 6658 n/a 6659 n/a if (PyUnicode_READY(unicode) == -1) 6660 n/a return NULL; 6661 n/a len = PyUnicode_GET_LENGTH(unicode); 6662 n/a 6663 n/a make_encode_exception(exceptionObject, 6664 n/a encoding, unicode, startpos, endpos, reason); 6665 n/a if (*exceptionObject == NULL) 6666 n/a return NULL; 6667 n/a 6668 n/a restuple = PyObject_CallFunctionObjArgs( 6669 n/a *errorHandler, *exceptionObject, NULL); 6670 n/a if (restuple == NULL) 6671 n/a return NULL; 6672 n/a if (!PyTuple_Check(restuple)) { 6673 n/a PyErr_SetString(PyExc_TypeError, &argparse[3]); 6674 n/a Py_DECREF(restuple); 6675 n/a return NULL; 6676 n/a } 6677 n/a if (!PyArg_ParseTuple(restuple, argparse, 6678 n/a &resunicode, newpos)) { 6679 n/a Py_DECREF(restuple); 6680 n/a return NULL; 6681 n/a } 6682 n/a if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6683 n/a PyErr_SetString(PyExc_TypeError, &argparse[3]); 6684 n/a Py_DECREF(restuple); 6685 n/a return NULL; 6686 n/a } 6687 n/a if (*newpos<0) 6688 n/a *newpos = len + *newpos; 6689 n/a if (*newpos<0 || *newpos>len) { 6690 n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6691 n/a Py_DECREF(restuple); 6692 n/a return NULL; 6693 n/a } 6694 n/a Py_INCREF(resunicode); 6695 n/a Py_DECREF(restuple); 6696 n/a return resunicode; 6697 n/a } 6698 n/a 6699 n/a static PyObject * 6700 n/a unicode_encode_ucs1(PyObject *unicode, 6701 n/a const char *errors, 6702 n/a const Py_UCS4 limit) 6703 n/a { 6704 n/a /* input state */ 6705 n/a Py_ssize_t pos=0, size; 6706 n/a int kind; 6707 n/a void *data; 6708 n/a /* pointer into the output */ 6709 n/a char *str; 6710 
n/a const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6711 n/a const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6712 n/a PyObject *error_handler_obj = NULL; 6713 n/a PyObject *exc = NULL; 6714 n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6715 n/a PyObject *rep = NULL; 6716 n/a /* output object */ 6717 n/a _PyBytesWriter writer; 6718 n/a 6719 n/a if (PyUnicode_READY(unicode) == -1) 6720 n/a return NULL; 6721 n/a size = PyUnicode_GET_LENGTH(unicode); 6722 n/a kind = PyUnicode_KIND(unicode); 6723 n/a data = PyUnicode_DATA(unicode); 6724 n/a /* allocate enough for a simple encoding without 6725 n/a replacements, if we need more, we'll resize */ 6726 n/a if (size == 0) 6727 n/a return PyBytes_FromStringAndSize(NULL, 0); 6728 n/a 6729 n/a _PyBytesWriter_Init(&writer); 6730 n/a str = _PyBytesWriter_Alloc(&writer, size); 6731 n/a if (str == NULL) 6732 n/a return NULL; 6733 n/a 6734 n/a while (pos < size) { 6735 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6736 n/a 6737 n/a /* can we encode this? */ 6738 n/a if (ch < limit) { 6739 n/a /* no overflow check, because we know that the space is enough */ 6740 n/a *str++ = (char)ch; 6741 n/a ++pos; 6742 n/a } 6743 n/a else { 6744 n/a Py_ssize_t newpos, i; 6745 n/a /* startpos for collecting unencodable chars */ 6746 n/a Py_ssize_t collstart = pos; 6747 n/a Py_ssize_t collend = collstart + 1; 6748 n/a /* extend [collstart, collend) over the whole run of consecutive unencodable characters */ 6749 n/a 6750 n/a while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6751 n/a ++collend; 6752 n/a 6753 n/a /* Only overallocate the buffer if it's not the last write */ 6754 n/a writer.overallocate = (collend < size); 6755 n/a 6756 n/a /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6757 n/a if (error_handler == _Py_ERROR_UNKNOWN) 6758 n/a error_handler = get_error_handler(errors); 6759 n/a 6760 n/a switch (error_handler) { 6761 n/a case _Py_ERROR_STRICT: 6762 n/a raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6763 n/a goto onError; 6764 n/a 6765 n/a case _Py_ERROR_REPLACE: 6766 n/a memset(str, '?', collend - collstart); 6767 n/a str += (collend - collstart); 6768 n/a /* fall through ignore error handler */ 6769 n/a case _Py_ERROR_IGNORE: 6770 n/a pos = collend; 6771 n/a break; 6772 n/a 6773 n/a case _Py_ERROR_BACKSLASHREPLACE: 6774 n/a /* subtract preallocated bytes */ 6775 n/a writer.min_size -= (collend - collstart); 6776 n/a str = backslashreplace(&writer, str, 6777 n/a unicode, collstart, collend); 6778 n/a if (str == NULL) 6779 n/a goto onError; 6780 n/a pos = collend; 6781 n/a break; 6782 n/a 6783 n/a case _Py_ERROR_XMLCHARREFREPLACE: 6784 n/a /* subtract preallocated bytes */ 6785 n/a writer.min_size -= (collend - collstart); 6786 n/a str = xmlcharrefreplace(&writer, str, 6787 n/a unicode, collstart, collend); 6788 n/a if (str == NULL) 6789 n/a goto onError; 6790 n/a pos = collend; 6791 n/a break; 6792 n/a 6793 n/a case _Py_ERROR_SURROGATEESCAPE: 6794 n/a for (i = collstart; i < collend; ++i) { 6795 n/a ch = PyUnicode_READ(kind, data, i); 6796 n/a if (ch < 0xdc80 || 0xdcff < ch) { 6797 n/a /* Not a UTF-8b surrogate */ 6798 n/a break; 6799 n/a } 6800 n/a *str++ = (char)(ch - 0xdc00); 6801 n/a ++pos; 6802 n/a } 6803 n/a if (i >= collend) 6804 n/a break; 6805 n/a collstart = pos; 6806 n/a assert(collstart != collend); 6807 n/a /* fallback to general error handling */ 6808 n/a 6809 n/a default: 6810 n/a rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, 6811 n/a encoding, reason, unicode, &exc, 6812 n/a collstart, collend, &newpos); 6813 n/a if (rep == NULL) 6814 n/a goto onError; 6815 n/a 6816 n/a /* subtract preallocated bytes */ 6817 n/a writer.min_size -= newpos - 
collstart; 6818 n/a 6819 n/a if (PyBytes_Check(rep)) { 6820 n/a /* Directly copy bytes result to output. */ 6821 n/a str = _PyBytesWriter_WriteBytes(&writer, str, 6822 n/a PyBytes_AS_STRING(rep), 6823 n/a PyBytes_GET_SIZE(rep)); 6824 n/a if (str == NULL) 6825 n/a goto onError; 6826 n/a } 6827 n/a else { 6828 n/a assert(PyUnicode_Check(rep)); 6829 n/a 6830 n/a if (PyUnicode_READY(rep) < 0) 6831 n/a goto onError; 6832 n/a 6833 n/a if (limit == 256 ? 6834 n/a PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND : 6835 n/a !PyUnicode_IS_ASCII(rep)) 6836 n/a { 6837 n/a /* Not all characters are smaller than limit */ 6838 n/a raise_encode_exception(&exc, encoding, unicode, 6839 n/a collstart, collend, reason); 6840 n/a goto onError; 6841 n/a } 6842 n/a assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 6843 n/a /* The str replacement is 1-byte kind here, so its data can be copied as raw bytes. */ str = _PyBytesWriter_WriteBytes(&writer, str, 6844 n/a PyUnicode_DATA(rep), 6845 n/a PyUnicode_GET_LENGTH(rep)); /* Same failure check as the bytes branch above: a NULL result must not be carried into the next write through str. */ if (str == NULL) goto onError; 6846 n/a } 6847 n/a pos = newpos; 6848 n/a Py_CLEAR(rep); 6849 n/a } 6850 n/a 6851 n/a /* If overallocation was disabled, ensure that it was the last 6852 n/a write. 
Otherwise, we missed an optimization */ 6853 n/a assert(writer.overallocate || pos == size); 6854 n/a } 6855 n/a } 6856 n/a 6857 n/a Py_XDECREF(error_handler_obj); 6858 n/a Py_XDECREF(exc); 6859 n/a return _PyBytesWriter_Finish(&writer, str); 6860 n/a 6861 n/a onError: 6862 n/a Py_XDECREF(rep); 6863 n/a _PyBytesWriter_Dealloc(&writer); 6864 n/a Py_XDECREF(error_handler_obj); 6865 n/a Py_XDECREF(exc); 6866 n/a return NULL; 6867 n/a } 6868 n/a 6869 n/a /* Deprecated */ 6870 n/a PyObject * 6871 n/a PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6872 n/a Py_ssize_t size, 6873 n/a const char *errors) 6874 n/a { 6875 n/a PyObject *result; 6876 n/a PyObject *unicode = PyUnicode_FromWideChar(p, size); 6877 n/a if (unicode == NULL) 6878 n/a return NULL; 6879 n/a result = unicode_encode_ucs1(unicode, errors, 256); 6880 n/a Py_DECREF(unicode); 6881 n/a return result; 6882 n/a } 6883 n/a 6884 n/a PyObject * 6885 n/a _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6886 n/a { 6887 n/a if (!PyUnicode_Check(unicode)) { 6888 n/a PyErr_BadArgument(); 6889 n/a return NULL; 6890 n/a } 6891 n/a if (PyUnicode_READY(unicode) == -1) 6892 n/a return NULL; 6893 n/a /* Fast path: if it is a one-byte string, construct 6894 n/a bytes object directly. */ 6895 n/a if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6896 n/a return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6897 n/a PyUnicode_GET_LENGTH(unicode)); 6898 n/a /* Non-Latin-1 characters present. Defer to above function to 6899 n/a raise the exception. 
*/ 6900 n/a return unicode_encode_ucs1(unicode, errors, 256); 6901 n/a } 6902 n/a 6903 n/a PyObject* 6904 n/a PyUnicode_AsLatin1String(PyObject *unicode) 6905 n/a { 6906 n/a return _PyUnicode_AsLatin1String(unicode, NULL); 6907 n/a } 6908 n/a 6909 n/a /* --- 7-bit ASCII Codec -------------------------------------------------- */ 6910 n/a 6911 n/a PyObject * 6912 n/a PyUnicode_DecodeASCII(const char *s, 6913 n/a Py_ssize_t size, 6914 n/a const char *errors) 6915 n/a { 6916 n/a const char *starts = s; 6917 n/a _PyUnicodeWriter writer; 6918 n/a int kind; 6919 n/a void *data; 6920 n/a Py_ssize_t startinpos; 6921 n/a Py_ssize_t endinpos; 6922 n/a Py_ssize_t outpos; 6923 n/a const char *e; 6924 n/a PyObject *error_handler_obj = NULL; 6925 n/a PyObject *exc = NULL; 6926 n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6927 n/a 6928 n/a if (size == 0) 6929 n/a _Py_RETURN_UNICODE_EMPTY(); 6930 n/a 6931 n/a /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6932 n/a if (size == 1 && (unsigned char)s[0] < 128) 6933 n/a return get_latin1_char((unsigned char)s[0]); 6934 n/a 6935 n/a _PyUnicodeWriter_Init(&writer); 6936 n/a writer.min_length = size; 6937 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6938 n/a return NULL; 6939 n/a 6940 n/a e = s + size; 6941 n/a data = writer.data; 6942 n/a /* the ascii_decode() result is used as the count of leading bytes already written to data */ outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6943 n/a writer.pos = outpos; 6944 n/a if (writer.pos == size) 6945 n/a return _PyUnicodeWriter_Finish(&writer); 6946 n/a /* slow path: resume at the first byte the fast path did not consume */ 6947 n/a s += writer.pos; 6948 n/a kind = writer.kind; 6949 n/a while (s < e) { 6950 n/a unsigned char c = (unsigned char)*s; 6951 n/a if (c < 128) { 6952 n/a PyUnicode_WRITE(kind, data, writer.pos, c); 6953 n/a writer.pos++; 6954 n/a ++s; 6955 n/a continue; 6956 n/a } 6957 n/a 6958 n/a /* byte outside range 0x00..0x7f: call the error handler */ 6959 n/a 6960 n/a if (error_handler == _Py_ERROR_UNKNOWN) 6961 n/a error_handler = get_error_handler(errors); 6962 n/a 6963 n/a switch 
(error_handler) 6964 n/a { 6965 n/a case _Py_ERROR_REPLACE: 6966 n/a case _Py_ERROR_SURROGATEESCAPE: 6967 n/a /* Fast-path: the error handler only writes one character, 6968 n/a but we may switch to UCS2 at the first write */ 6969 n/a if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) 6970 n/a goto onError; 6971 n/a kind = writer.kind; 6972 n/a data = writer.data; 6973 n/a 6974 n/a if (error_handler == _Py_ERROR_REPLACE) 6975 n/a PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); 6976 n/a else 6977 n/a PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); 6978 n/a writer.pos++; 6979 n/a ++s; 6980 n/a break; 6981 n/a 6982 n/a case _Py_ERROR_IGNORE: 6983 n/a ++s; 6984 n/a break; 6985 n/a 6986 n/a default: 6987 n/a startinpos = s-starts; 6988 n/a endinpos = startinpos + 1; 6989 n/a if (unicode_decode_call_errorhandler_writer( 6990 n/a errors, &error_handler_obj, 6991 n/a "ascii", "ordinal not in range(128)", 6992 n/a &starts, &e, &startinpos, &endinpos, &exc, &s, 6993 n/a &writer)) 6994 n/a goto onError; 6995 n/a kind = writer.kind; 6996 n/a data = writer.data; 6997 n/a } 6998 n/a } 6999 n/a Py_XDECREF(error_handler_obj); 7000 n/a Py_XDECREF(exc); 7001 n/a return _PyUnicodeWriter_Finish(&writer); 7002 n/a 7003 n/a onError: 7004 n/a _PyUnicodeWriter_Dealloc(&writer); 7005 n/a Py_XDECREF(error_handler_obj); 7006 n/a Py_XDECREF(exc); 7007 n/a return NULL; 7008 n/a } 7009 n/a 7010 n/a /* Deprecated */ 7011 n/a PyObject * 7012 n/a PyUnicode_EncodeASCII(const Py_UNICODE *p, 7013 n/a Py_ssize_t size, 7014 n/a const char *errors) 7015 n/a { 7016 n/a PyObject *result; 7017 n/a PyObject *unicode = PyUnicode_FromWideChar(p, size); 7018 n/a if (unicode == NULL) 7019 n/a return NULL; 7020 n/a result = unicode_encode_ucs1(unicode, errors, 128); 7021 n/a Py_DECREF(unicode); 7022 n/a return result; 7023 n/a } 7024 n/a 7025 n/a PyObject * 7026 n/a _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 7027 n/a { 7028 n/a if (!PyUnicode_Check(unicode)) { 7029 
n/a PyErr_BadArgument(); 7030 n/a return NULL; 7031 n/a } 7032 n/a if (PyUnicode_READY(unicode) == -1) 7033 n/a return NULL; 7034 n/a /* Fast path: if it is an ASCII-only string, construct bytes object 7035 n/a directly. Else defer to above function to raise the exception. */ 7036 n/a if (PyUnicode_IS_ASCII(unicode)) 7037 n/a return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 7038 n/a PyUnicode_GET_LENGTH(unicode)); 7039 n/a return unicode_encode_ucs1(unicode, errors, 128); 7040 n/a } 7041 n/a 7042 n/a PyObject * 7043 n/a PyUnicode_AsASCIIString(PyObject *unicode) 7044 n/a { 7045 n/a return _PyUnicode_AsASCIIString(unicode, NULL); 7046 n/a } 7047 n/a 7048 n/a #ifdef MS_WINDOWS 7049 n/a 7050 n/a /* --- MBCS codecs for Windows -------------------------------------------- */ 7051 n/a 7052 n/a #if SIZEOF_INT < SIZEOF_SIZE_T 7053 n/a #define NEED_RETRY 7054 n/a #endif 7055 n/a 7056 n/a #ifndef WC_ERR_INVALID_CHARS 7057 n/a # define WC_ERR_INVALID_CHARS 0x0080 7058 n/a #endif 7059 n/a 7060 n/a static const char* 7061 n/a code_page_name(UINT code_page, PyObject **obj) 7062 n/a { 7063 n/a *obj = NULL; 7064 n/a if (code_page == CP_ACP) 7065 n/a return "mbcs"; 7066 n/a if (code_page == CP_UTF7) 7067 n/a return "CP_UTF7"; 7068 n/a if (code_page == CP_UTF8) 7069 n/a return "CP_UTF8"; 7070 n/a 7071 n/a *obj = PyBytes_FromFormat("cp%u", code_page); 7072 n/a if (*obj == NULL) 7073 n/a return NULL; 7074 n/a return PyBytes_AS_STRING(*obj); 7075 n/a } 7076 n/a 7077 n/a static DWORD 7078 n/a decode_code_page_flags(UINT code_page) 7079 n/a { 7080 n/a if (code_page == CP_UTF7) { 7081 n/a /* The CP_UTF7 decoder only supports flags=0 */ 7082 n/a return 0; 7083 n/a } 7084 n/a else 7085 n/a return MB_ERR_INVALID_CHARS; 7086 n/a } 7087 n/a 7088 n/a /* 7089 n/a * Decode a byte string from a Windows code page into unicode object in strict 7090 n/a * mode. 
7091 n/a * 7092 n/a * Returns consumed size if succeed, returns -2 on decode error, or raise an 7093 n/a * OSError and returns -1 on other error. 7094 n/a */ 7095 n/a static int 7096 n/a decode_code_page_strict(UINT code_page, 7097 n/a PyObject **v, 7098 n/a const char *in, 7099 n/a int insize) 7100 n/a { 7101 n/a const DWORD flags = decode_code_page_flags(code_page); 7102 n/a wchar_t *out; 7103 n/a DWORD outsize; 7104 n/a 7105 n/a /* First get the size of the result */ 7106 n/a assert(insize > 0); 7107 n/a outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 7108 n/a if (outsize <= 0) 7109 n/a goto error; 7110 n/a 7111 n/a if (*v == NULL) { 7112 n/a /* Create unicode object */ 7113 n/a /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 7114 n/a *v = (PyObject*)_PyUnicode_New(outsize); 7115 n/a if (*v == NULL) 7116 n/a return -1; 7117 n/a out = PyUnicode_AS_UNICODE(*v); 7118 n/a } 7119 n/a else { 7120 n/a /* Extend unicode object */ 7121 n/a Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7122 n/a if (unicode_resize(v, n + outsize) < 0) 7123 n/a return -1; 7124 n/a out = PyUnicode_AS_UNICODE(*v) + n; 7125 n/a } 7126 n/a 7127 n/a /* Do the conversion */ 7128 n/a outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 7129 n/a if (outsize <= 0) 7130 n/a goto error; 7131 n/a return insize; 7132 n/a 7133 n/a error: 7134 n/a if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7135 n/a return -2; 7136 n/a PyErr_SetFromWindowsErr(0); 7137 n/a return -1; 7138 n/a } 7139 n/a 7140 n/a /* 7141 n/a * Decode a byte string from a code page into unicode object with an error 7142 n/a * handler. 7143 n/a * 7144 n/a * Returns consumed size if succeed, or raise an OSError or 7145 n/a * UnicodeDecodeError exception and returns -1 on error. 
7146 n/a */ 7147 n/a static int 7148 n/a decode_code_page_errors(UINT code_page, 7149 n/a PyObject **v, 7150 n/a const char *in, const int size, 7151 n/a const char *errors, int final) 7152 n/a { 7153 n/a const char *startin = in; 7154 n/a const char *endin = in + size; 7155 n/a const DWORD flags = decode_code_page_flags(code_page); 7156 n/a /* Ideally, we should get reason from FormatMessage. This is the Windows 7157 n/a 2000 English version of the message. */ 7158 n/a const char *reason = "No mapping for the Unicode character exists " 7159 n/a "in the target code page."; 7160 n/a /* each step cannot decode more than 1 character, but a character can be 7161 n/a represented as a surrogate pair */ 7162 n/a wchar_t buffer[2], *startout, *out; 7163 n/a int insize; 7164 n/a Py_ssize_t outsize; 7165 n/a PyObject *errorHandler = NULL; 7166 n/a PyObject *exc = NULL; 7167 n/a PyObject *encoding_obj = NULL; 7168 n/a const char *encoding; 7169 n/a DWORD err; 7170 n/a int ret = -1; 7171 n/a 7172 n/a assert(size > 0); 7173 n/a 7174 n/a encoding = code_page_name(code_page, &encoding_obj); 7175 n/a if (encoding == NULL) 7176 n/a return -1; 7177 n/a 7178 n/a if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 7179 n/a /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 7180 n/a UnicodeDecodeError. 
*/ 7181 n/a make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 7182 n/a if (exc != NULL) { 7183 n/a PyCodec_StrictErrors(exc); 7184 n/a Py_CLEAR(exc); 7185 n/a } 7186 n/a goto error; 7187 n/a } 7188 n/a 7189 n/a if (*v == NULL) { 7190 n/a /* Create unicode object */ 7191 n/a if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7192 n/a PyErr_NoMemory(); 7193 n/a goto error; 7194 n/a } 7195 n/a /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 7196 n/a *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 7197 n/a if (*v == NULL) 7198 n/a goto error; 7199 n/a startout = PyUnicode_AS_UNICODE(*v); 7200 n/a } 7201 n/a else { 7202 n/a /* Extend unicode object */ 7203 n/a Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7204 n/a if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7205 n/a PyErr_NoMemory(); 7206 n/a goto error; 7207 n/a } 7208 n/a if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7209 n/a goto error; 7210 n/a startout = PyUnicode_AS_UNICODE(*v) + n; 7211 n/a } 7212 n/a 7213 n/a /* Decode the byte string character per character */ 7214 n/a out = startout; 7215 n/a while (in < endin) 7216 n/a { 7217 n/a /* Decode a character */ 7218 n/a insize = 1; 7219 n/a do 7220 n/a { 7221 n/a outsize = MultiByteToWideChar(code_page, flags, 7222 n/a in, insize, 7223 n/a buffer, Py_ARRAY_LENGTH(buffer)); 7224 n/a if (outsize > 0) 7225 n/a break; 7226 n/a err = GetLastError(); 7227 n/a if (err != ERROR_NO_UNICODE_TRANSLATION 7228 n/a && err != ERROR_INSUFFICIENT_BUFFER) 7229 n/a { 7230 n/a PyErr_SetFromWindowsErr(0); 7231 n/a goto error; 7232 n/a } 7233 n/a insize++; 7234 n/a } 7235 n/a /* 4=maximum length of a UTF-8 sequence */ 7236 n/a while (insize <= 4 && (in + insize) <= endin); 7237 n/a 7238 n/a if (outsize <= 0) { 7239 n/a Py_ssize_t startinpos, endinpos, outpos; 7240 n/a 7241 n/a /* last character in partial decode? 
*/ 7242 n/a if (in + insize >= endin && !final) 7243 n/a break; 7244 n/a 7245 n/a startinpos = in - startin; 7246 n/a endinpos = startinpos + 1; 7247 n/a outpos = out - PyUnicode_AS_UNICODE(*v); 7248 n/a if (unicode_decode_call_errorhandler_wchar( 7249 n/a errors, &errorHandler, 7250 n/a encoding, reason, 7251 n/a &startin, &endin, &startinpos, &endinpos, &exc, &in, 7252 n/a v, &outpos)) 7253 n/a { 7254 n/a goto error; 7255 n/a } 7256 n/a out = PyUnicode_AS_UNICODE(*v) + outpos; 7257 n/a } 7258 n/a else { 7259 n/a in += insize; 7260 n/a memcpy(out, buffer, outsize * sizeof(wchar_t)); 7261 n/a out += outsize; 7262 n/a } 7263 n/a } 7264 n/a 7265 n/a /* write a NUL character at the end */ 7266 n/a *out = 0; 7267 n/a 7268 n/a /* Extend unicode object */ 7269 n/a outsize = out - startout; 7270 n/a assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7271 n/a if (unicode_resize(v, outsize) < 0) 7272 n/a goto error; 7273 n/a /* (in - startin) <= size and size is an int */ 7274 n/a ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7275 n/a 7276 n/a error: 7277 n/a Py_XDECREF(encoding_obj); 7278 n/a Py_XDECREF(errorHandler); 7279 n/a Py_XDECREF(exc); 7280 n/a return ret; 7281 n/a } 7282 n/a 7283 n/a static PyObject * 7284 n/a decode_code_page_stateful(int code_page, 7285 n/a const char *s, Py_ssize_t size, 7286 n/a const char *errors, Py_ssize_t *consumed) 7287 n/a { 7288 n/a PyObject *v = NULL; 7289 n/a int chunk_size, final, converted, done; 7290 n/a 7291 n/a if (code_page < 0) { 7292 n/a PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7293 n/a return NULL; 7294 n/a } 7295 n/a 7296 n/a if (consumed) 7297 n/a *consumed = 0; 7298 n/a 7299 n/a do 7300 n/a { 7301 n/a #ifdef NEED_RETRY 7302 n/a if (size > INT_MAX) { 7303 n/a chunk_size = INT_MAX; 7304 n/a final = 0; 7305 n/a done = 0; 7306 n/a } 7307 n/a else 7308 n/a #endif 7309 n/a { 7310 n/a chunk_size = (int)size; 7311 n/a final = (consumed == NULL); 7312 n/a done = 1; 7313 n/a } 7314 n/a 7315 n/a if 
(chunk_size == 0 && done) { 7316 n/a if (v != NULL) 7317 n/a break; 7318 n/a _Py_RETURN_UNICODE_EMPTY(); 7319 n/a } 7320 n/a 7321 n/a converted = decode_code_page_strict(code_page, &v, 7322 n/a s, chunk_size); 7323 n/a if (converted == -2) 7324 n/a converted = decode_code_page_errors(code_page, &v, 7325 n/a s, chunk_size, 7326 n/a errors, final); 7327 n/a assert(converted != 0 || done); 7328 n/a 7329 n/a if (converted < 0) { 7330 n/a Py_XDECREF(v); 7331 n/a return NULL; 7332 n/a } 7333 n/a 7334 n/a if (consumed) 7335 n/a *consumed += converted; 7336 n/a 7337 n/a s += converted; 7338 n/a size -= converted; 7339 n/a } while (!done); 7340 n/a 7341 n/a return unicode_result(v); 7342 n/a } 7343 n/a 7344 n/a PyObject * 7345 n/a PyUnicode_DecodeCodePageStateful(int code_page, 7346 n/a const char *s, 7347 n/a Py_ssize_t size, 7348 n/a const char *errors, 7349 n/a Py_ssize_t *consumed) 7350 n/a { 7351 n/a return decode_code_page_stateful(code_page, s, size, errors, consumed); 7352 n/a } 7353 n/a 7354 n/a PyObject * 7355 n/a PyUnicode_DecodeMBCSStateful(const char *s, 7356 n/a Py_ssize_t size, 7357 n/a const char *errors, 7358 n/a Py_ssize_t *consumed) 7359 n/a { 7360 n/a return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7361 n/a } 7362 n/a 7363 n/a PyObject * 7364 n/a PyUnicode_DecodeMBCS(const char *s, 7365 n/a Py_ssize_t size, 7366 n/a const char *errors) 7367 n/a { 7368 n/a return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7369 n/a } 7370 n/a 7371 n/a static DWORD 7372 n/a encode_code_page_flags(UINT code_page, const char *errors) 7373 n/a { 7374 n/a if (code_page == CP_UTF8) { 7375 n/a return WC_ERR_INVALID_CHARS; 7376 n/a } 7377 n/a else if (code_page == CP_UTF7) { 7378 n/a /* CP_UTF7 only supports flags=0 */ 7379 n/a return 0; 7380 n/a } 7381 n/a else { 7382 n/a if (errors != NULL && strcmp(errors, "replace") == 0) 7383 n/a return 0; 7384 n/a else 7385 n/a return WC_NO_BEST_FIT_CHARS; 7386 n/a } 7387 n/a } 7388 n/a 7389 n/a /* 7390 n/a * 
Encode a Unicode string to a Windows code page into a byte string in strict 7391 n/a * mode. 7392 n/a * 7393 n/a * Returns consumed characters if succeed, returns -2 on encode error, or raise 7394 n/a * an OSError and returns -1 on other error. 7395 n/a */ 7396 n/a static int 7397 n/a encode_code_page_strict(UINT code_page, PyObject **outbytes, 7398 n/a PyObject *unicode, Py_ssize_t offset, int len, 7399 n/a const char* errors) 7400 n/a { 7401 n/a BOOL usedDefaultChar = FALSE; 7402 n/a BOOL *pusedDefaultChar = &usedDefaultChar; 7403 n/a int outsize; 7404 n/a wchar_t *p; 7405 n/a Py_ssize_t size; 7406 n/a const DWORD flags = encode_code_page_flags(code_page, NULL); 7407 n/a char *out; 7408 n/a /* Create a substring so that we can get the UTF-16 representation 7409 n/a of just the slice under consideration. */ 7410 n/a PyObject *substring; 7411 n/a 7412 n/a assert(len > 0); 7413 n/a 7414 n/a if (code_page != CP_UTF8 && code_page != CP_UTF7) 7415 n/a pusedDefaultChar = &usedDefaultChar; 7416 n/a else 7417 n/a pusedDefaultChar = NULL; 7418 n/a 7419 n/a substring = PyUnicode_Substring(unicode, offset, offset+len); 7420 n/a if (substring == NULL) 7421 n/a return -1; 7422 n/a p = PyUnicode_AsUnicodeAndSize(substring, &size); 7423 n/a if (p == NULL) { 7424 n/a Py_DECREF(substring); 7425 n/a return -1; 7426 n/a } 7427 n/a assert(size <= INT_MAX); 7428 n/a 7429 n/a /* First get the size of the result */ 7430 n/a outsize = WideCharToMultiByte(code_page, flags, 7431 n/a p, (int)size, 7432 n/a NULL, 0, 7433 n/a NULL, pusedDefaultChar); 7434 n/a if (outsize <= 0) 7435 n/a goto error; 7436 n/a /* If we used a default char, then we failed! 
*/ 7437 n/a if (pusedDefaultChar && *pusedDefaultChar) { 7438 n/a Py_DECREF(substring); 7439 n/a return -2; 7440 n/a } 7441 n/a 7442 n/a if (*outbytes == NULL) { 7443 n/a /* Create string object */ 7444 n/a *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7445 n/a if (*outbytes == NULL) { 7446 n/a Py_DECREF(substring); 7447 n/a return -1; 7448 n/a } 7449 n/a out = PyBytes_AS_STRING(*outbytes); 7450 n/a } 7451 n/a else { 7452 n/a /* Extend string object */ 7453 n/a const Py_ssize_t n = PyBytes_Size(*outbytes); 7454 n/a if (outsize > PY_SSIZE_T_MAX - n) { 7455 n/a PyErr_NoMemory(); 7456 n/a Py_DECREF(substring); 7457 n/a return -1; 7458 n/a } 7459 n/a if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7460 n/a Py_DECREF(substring); 7461 n/a return -1; 7462 n/a } 7463 n/a out = PyBytes_AS_STRING(*outbytes) + n; 7464 n/a } 7465 n/a 7466 n/a /* Do the conversion */ 7467 n/a outsize = WideCharToMultiByte(code_page, flags, 7468 n/a p, (int)size, 7469 n/a out, outsize, 7470 n/a NULL, pusedDefaultChar); 7471 n/a Py_CLEAR(substring); 7472 n/a if (outsize <= 0) 7473 n/a goto error; 7474 n/a if (pusedDefaultChar && *pusedDefaultChar) 7475 n/a return -2; 7476 n/a return 0; 7477 n/a 7478 n/a error: 7479 n/a Py_XDECREF(substring); 7480 n/a if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7481 n/a return -2; 7482 n/a PyErr_SetFromWindowsErr(0); 7483 n/a return -1; 7484 n/a } 7485 n/a 7486 n/a /* 7487 n/a * Encode a Unicode string to a Windows code page into a byte string using an 7488 n/a * error handler. 7489 n/a * 7490 n/a * Returns consumed characters if succeed, or raise an OSError and returns 7491 n/a * -1 on other error. 
7492 n/a */ 7493 n/a static int 7494 n/a encode_code_page_errors(UINT code_page, PyObject **outbytes, 7495 n/a PyObject *unicode, Py_ssize_t unicode_offset, 7496 n/a Py_ssize_t insize, const char* errors) 7497 n/a { 7498 n/a const DWORD flags = encode_code_page_flags(code_page, errors); 7499 n/a Py_ssize_t pos = unicode_offset; 7500 n/a Py_ssize_t endin = unicode_offset + insize; 7501 n/a /* Ideally, we should get reason from FormatMessage. This is the Windows 7502 n/a 2000 English version of the message. */ 7503 n/a const char *reason = "invalid character"; 7504 n/a /* 4=maximum length of a UTF-8 sequence */ 7505 n/a char buffer[4]; 7506 n/a BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7507 n/a Py_ssize_t outsize; 7508 n/a char *out; 7509 n/a PyObject *errorHandler = NULL; 7510 n/a PyObject *exc = NULL; 7511 n/a PyObject *encoding_obj = NULL; 7512 n/a const char *encoding; 7513 n/a Py_ssize_t newpos, newoutsize; 7514 n/a PyObject *rep; 7515 n/a int ret = -1; 7516 n/a 7517 n/a assert(insize > 0); 7518 n/a 7519 n/a encoding = code_page_name(code_page, &encoding_obj); 7520 n/a if (encoding == NULL) 7521 n/a return -1; 7522 n/a 7523 n/a if (errors == NULL || strcmp(errors, "strict") == 0) { 7524 n/a /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7525 n/a then we raise a UnicodeEncodeError. 
*/ 7526 n/a make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7527 n/a if (exc != NULL) { 7528 n/a PyCodec_StrictErrors(exc); 7529 n/a Py_DECREF(exc); 7530 n/a } 7531 n/a Py_XDECREF(encoding_obj); 7532 n/a return -1; 7533 n/a } 7534 n/a 7535 n/a if (code_page != CP_UTF8 && code_page != CP_UTF7) 7536 n/a pusedDefaultChar = &usedDefaultChar; 7537 n/a else 7538 n/a pusedDefaultChar = NULL; 7539 n/a 7540 n/a if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7541 n/a PyErr_NoMemory(); 7542 n/a goto error; 7543 n/a } 7544 n/a outsize = insize * Py_ARRAY_LENGTH(buffer); 7545 n/a 7546 n/a if (*outbytes == NULL) { 7547 n/a /* Create string object */ 7548 n/a *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7549 n/a if (*outbytes == NULL) 7550 n/a goto error; 7551 n/a out = PyBytes_AS_STRING(*outbytes); 7552 n/a } 7553 n/a else { 7554 n/a /* Extend string object */ 7555 n/a Py_ssize_t n = PyBytes_Size(*outbytes); 7556 n/a if (n > PY_SSIZE_T_MAX - outsize) { 7557 n/a PyErr_NoMemory(); 7558 n/a goto error; 7559 n/a } 7560 n/a if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7561 n/a goto error; 7562 n/a out = PyBytes_AS_STRING(*outbytes) + n; 7563 n/a } 7564 n/a 7565 n/a /* Encode the string character per character */ 7566 n/a while (pos < endin) 7567 n/a { 7568 n/a Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7569 n/a wchar_t chars[2]; 7570 n/a int charsize; 7571 n/a if (ch < 0x10000) { 7572 n/a chars[0] = (wchar_t)ch; 7573 n/a charsize = 1; 7574 n/a } 7575 n/a else { 7576 n/a chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7577 n/a chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7578 n/a charsize = 2; 7579 n/a } 7580 n/a 7581 n/a outsize = WideCharToMultiByte(code_page, flags, 7582 n/a chars, charsize, 7583 n/a buffer, Py_ARRAY_LENGTH(buffer), 7584 n/a NULL, pusedDefaultChar); 7585 n/a if (outsize > 0) { 7586 n/a if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7587 n/a { 7588 n/a pos++; 7589 n/a memcpy(out, buffer, outsize); 7590 n/a out += outsize; 7591 
n/a continue; 7592 n/a } 7593 n/a } 7594 n/a else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7595 n/a PyErr_SetFromWindowsErr(0); 7596 n/a goto error; 7597 n/a } 7598 n/a 7599 n/a rep = unicode_encode_call_errorhandler( 7600 n/a errors, &errorHandler, encoding, reason, 7601 n/a unicode, &exc, 7602 n/a pos, pos + 1, &newpos); 7603 n/a if (rep == NULL) 7604 n/a goto error; 7605 n/a pos = newpos; 7606 n/a 7607 n/a if (PyBytes_Check(rep)) { 7608 n/a outsize = PyBytes_GET_SIZE(rep); 7609 n/a if (outsize != 1) { 7610 n/a Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7611 n/a newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7612 n/a if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7613 n/a Py_DECREF(rep); 7614 n/a goto error; 7615 n/a } 7616 n/a out = PyBytes_AS_STRING(*outbytes) + offset; 7617 n/a } 7618 n/a memcpy(out, PyBytes_AS_STRING(rep), outsize); 7619 n/a out += outsize; 7620 n/a } 7621 n/a else { 7622 n/a Py_ssize_t i; 7623 n/a enum PyUnicode_Kind kind; 7624 n/a void *data; 7625 n/a 7626 n/a if (PyUnicode_READY(rep) == -1) { 7627 n/a Py_DECREF(rep); 7628 n/a goto error; 7629 n/a } 7630 n/a 7631 n/a outsize = PyUnicode_GET_LENGTH(rep); 7632 n/a if (outsize != 1) { 7633 n/a Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7634 n/a newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7635 n/a if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7636 n/a Py_DECREF(rep); 7637 n/a goto error; 7638 n/a } 7639 n/a out = PyBytes_AS_STRING(*outbytes) + offset; 7640 n/a } 7641 n/a kind = PyUnicode_KIND(rep); 7642 n/a data = PyUnicode_DATA(rep); 7643 n/a for (i=0; i < outsize; i++) { 7644 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7645 n/a if (ch > 127) { 7646 n/a raise_encode_exception(&exc, 7647 n/a encoding, unicode, 7648 n/a pos, pos + 1, 7649 n/a "unable to encode error handler result to ASCII"); 7650 n/a Py_DECREF(rep); 7651 n/a goto error; 7652 n/a } 7653 n/a *out = (unsigned char)ch; 7654 n/a out++; 7655 n/a } 7656 n/a } 
7657 n/a Py_DECREF(rep); 7658 n/a } 7659 n/a /* write a NUL byte */ 7660 n/a *out = 0; 7661 n/a outsize = out - PyBytes_AS_STRING(*outbytes); 7662 n/a assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7663 n/a if (_PyBytes_Resize(outbytes, outsize) < 0) 7664 n/a goto error; 7665 n/a ret = 0; 7666 n/a 7667 n/a error: 7668 n/a Py_XDECREF(encoding_obj); 7669 n/a Py_XDECREF(errorHandler); 7670 n/a Py_XDECREF(exc); 7671 n/a return ret; 7672 n/a } 7673 n/a 7674 n/a static PyObject * 7675 n/a encode_code_page(int code_page, 7676 n/a PyObject *unicode, 7677 n/a const char *errors) 7678 n/a { 7679 n/a Py_ssize_t len; 7680 n/a PyObject *outbytes = NULL; 7681 n/a Py_ssize_t offset; 7682 n/a int chunk_len, ret, done; 7683 n/a 7684 n/a if (!PyUnicode_Check(unicode)) { 7685 n/a PyErr_BadArgument(); 7686 n/a return NULL; 7687 n/a } 7688 n/a 7689 n/a if (PyUnicode_READY(unicode) == -1) 7690 n/a return NULL; 7691 n/a len = PyUnicode_GET_LENGTH(unicode); 7692 n/a 7693 n/a if (code_page < 0) { 7694 n/a PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7695 n/a return NULL; 7696 n/a } 7697 n/a 7698 n/a if (len == 0) 7699 n/a return PyBytes_FromStringAndSize(NULL, 0); 7700 n/a 7701 n/a offset = 0; 7702 n/a do 7703 n/a { 7704 n/a #ifdef NEED_RETRY 7705 n/a /* UTF-16 encoding may double the size, so use only INT_MAX/2 7706 n/a chunks. 
*/ 7707 n/a if (len > INT_MAX/2) { 7708 n/a chunk_len = INT_MAX/2; 7709 n/a done = 0; 7710 n/a } 7711 n/a else 7712 n/a #endif 7713 n/a { 7714 n/a chunk_len = (int)len; 7715 n/a done = 1; 7716 n/a } 7717 n/a 7718 n/a ret = encode_code_page_strict(code_page, &outbytes, 7719 n/a unicode, offset, chunk_len, 7720 n/a errors); 7721 n/a if (ret == -2) 7722 n/a ret = encode_code_page_errors(code_page, &outbytes, 7723 n/a unicode, offset, 7724 n/a chunk_len, errors); 7725 n/a if (ret < 0) { 7726 n/a Py_XDECREF(outbytes); 7727 n/a return NULL; 7728 n/a } 7729 n/a 7730 n/a offset += chunk_len; 7731 n/a len -= chunk_len; 7732 n/a } while (!done); 7733 n/a 7734 n/a return outbytes; 7735 n/a } 7736 n/a 7737 n/a PyObject * 7738 n/a PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7739 n/a Py_ssize_t size, 7740 n/a const char *errors) 7741 n/a { 7742 n/a PyObject *unicode, *res; 7743 n/a unicode = PyUnicode_FromWideChar(p, size); 7744 n/a if (unicode == NULL) 7745 n/a return NULL; 7746 n/a res = encode_code_page(CP_ACP, unicode, errors); 7747 n/a Py_DECREF(unicode); 7748 n/a return res; 7749 n/a } 7750 n/a 7751 n/a PyObject * 7752 n/a PyUnicode_EncodeCodePage(int code_page, 7753 n/a PyObject *unicode, 7754 n/a const char *errors) 7755 n/a { 7756 n/a return encode_code_page(code_page, unicode, errors); 7757 n/a } 7758 n/a 7759 n/a PyObject * 7760 n/a PyUnicode_AsMBCSString(PyObject *unicode) 7761 n/a { 7762 n/a return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7763 n/a } 7764 n/a 7765 n/a #undef NEED_RETRY 7766 n/a 7767 n/a #endif /* MS_WINDOWS */ 7768 n/a 7769 n/a /* --- Character Mapping Codec -------------------------------------------- */ 7770 n/a 7771 n/a static int 7772 n/a charmap_decode_string(const char *s, 7773 n/a Py_ssize_t size, 7774 n/a PyObject *mapping, 7775 n/a const char *errors, 7776 n/a _PyUnicodeWriter *writer) 7777 n/a { 7778 n/a const char *starts = s; 7779 n/a const char *e; 7780 n/a Py_ssize_t startinpos, endinpos; 7781 n/a PyObject *errorHandler = NULL, 
*exc = NULL; 7782 n/a Py_ssize_t maplen; 7783 n/a enum PyUnicode_Kind mapkind; 7784 n/a void *mapdata; 7785 n/a Py_UCS4 x; 7786 n/a unsigned char ch; 7787 n/a 7788 n/a if (PyUnicode_READY(mapping) == -1) 7789 n/a return -1; 7790 n/a 7791 n/a maplen = PyUnicode_GET_LENGTH(mapping); 7792 n/a mapdata = PyUnicode_DATA(mapping); 7793 n/a mapkind = PyUnicode_KIND(mapping); 7794 n/a 7795 n/a e = s + size; 7796 n/a 7797 n/a if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7798 n/a /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7799 n/a * is disabled in encoding aliases, latin1 is preferred because 7800 n/a * its implementation is faster. */ 7801 n/a Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7802 n/a Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7803 n/a Py_UCS4 maxchar = writer->maxchar; 7804 n/a 7805 n/a assert (writer->kind == PyUnicode_1BYTE_KIND); 7806 n/a while (s < e) { 7807 n/a ch = *s; 7808 n/a x = mapdata_ucs1[ch]; 7809 n/a if (x > maxchar) { 7810 n/a if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7811 n/a goto onError; 7812 n/a maxchar = writer->maxchar; 7813 n/a outdata = (Py_UCS1 *)writer->data; 7814 n/a } 7815 n/a outdata[writer->pos] = x; 7816 n/a writer->pos++; 7817 n/a ++s; 7818 n/a } 7819 n/a return 0; 7820 n/a } 7821 n/a 7822 n/a while (s < e) { 7823 n/a if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7824 n/a enum PyUnicode_Kind outkind = writer->kind; 7825 n/a Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7826 n/a if (outkind == PyUnicode_1BYTE_KIND) { 7827 n/a Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7828 n/a Py_UCS4 maxchar = writer->maxchar; 7829 n/a while (s < e) { 7830 n/a ch = *s; 7831 n/a x = mapdata_ucs2[ch]; 7832 n/a if (x > maxchar) 7833 n/a goto Error; 7834 n/a outdata[writer->pos] = x; 7835 n/a writer->pos++; 7836 n/a ++s; 7837 n/a } 7838 n/a break; 7839 n/a } 7840 n/a else if (outkind == PyUnicode_2BYTE_KIND) { 7841 n/a Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7842 n/a while (s < e) { 7843 n/a ch = 
*s; 7844 n/a x = mapdata_ucs2[ch]; 7845 n/a if (x == 0xFFFE) 7846 n/a goto Error; 7847 n/a outdata[writer->pos] = x; 7848 n/a writer->pos++; 7849 n/a ++s; 7850 n/a } 7851 n/a break; 7852 n/a } 7853 n/a } 7854 n/a ch = *s; 7855 n/a 7856 n/a if (ch < maplen) 7857 n/a x = PyUnicode_READ(mapkind, mapdata, ch); 7858 n/a else 7859 n/a x = 0xfffe; /* invalid value */ 7860 n/a Error: 7861 n/a if (x == 0xfffe) 7862 n/a { 7863 n/a /* undefined mapping */ 7864 n/a startinpos = s-starts; 7865 n/a endinpos = startinpos+1; 7866 n/a if (unicode_decode_call_errorhandler_writer( 7867 n/a errors, &errorHandler, 7868 n/a "charmap", "character maps to <undefined>", 7869 n/a &starts, &e, &startinpos, &endinpos, &exc, &s, 7870 n/a writer)) { 7871 n/a goto onError; 7872 n/a } 7873 n/a continue; 7874 n/a } 7875 n/a 7876 n/a if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7877 n/a goto onError; 7878 n/a ++s; 7879 n/a } 7880 n/a Py_XDECREF(errorHandler); 7881 n/a Py_XDECREF(exc); 7882 n/a return 0; 7883 n/a 7884 n/a onError: 7885 n/a Py_XDECREF(errorHandler); 7886 n/a Py_XDECREF(exc); 7887 n/a return -1; 7888 n/a } 7889 n/a 7890 n/a static int 7891 n/a charmap_decode_mapping(const char *s, 7892 n/a Py_ssize_t size, 7893 n/a PyObject *mapping, 7894 n/a const char *errors, 7895 n/a _PyUnicodeWriter *writer) 7896 n/a { 7897 n/a const char *starts = s; 7898 n/a const char *e; 7899 n/a Py_ssize_t startinpos, endinpos; 7900 n/a PyObject *errorHandler = NULL, *exc = NULL; 7901 n/a unsigned char ch; 7902 n/a PyObject *key, *item = NULL; 7903 n/a 7904 n/a e = s + size; 7905 n/a 7906 n/a while (s < e) { 7907 n/a ch = *s; 7908 n/a 7909 n/a /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7910 n/a key = PyLong_FromLong((long)ch); 7911 n/a if (key == NULL) 7912 n/a goto onError; 7913 n/a 7914 n/a item = PyObject_GetItem(mapping, key); 7915 n/a Py_DECREF(key); 7916 n/a if (item == NULL) { 7917 n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7918 n/a /* No mapping found means: 
mapping is undefined. */ 7919 n/a PyErr_Clear(); 7920 n/a goto Undefined; 7921 n/a } else 7922 n/a goto onError; 7923 n/a } 7924 n/a 7925 n/a /* Apply mapping */ 7926 n/a if (item == Py_None) 7927 n/a goto Undefined; 7928 n/a if (PyLong_Check(item)) { 7929 n/a long value = PyLong_AS_LONG(item); 7930 n/a if (value == 0xFFFE) 7931 n/a goto Undefined; 7932 n/a if (value < 0 || value > MAX_UNICODE) { 7933 n/a PyErr_Format(PyExc_TypeError, 7934 n/a "character mapping must be in range(0x%lx)", 7935 n/a (unsigned long)MAX_UNICODE + 1); 7936 n/a goto onError; 7937 n/a } 7938 n/a 7939 n/a if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7940 n/a goto onError; 7941 n/a } 7942 n/a else if (PyUnicode_Check(item)) { 7943 n/a if (PyUnicode_READY(item) == -1) 7944 n/a goto onError; 7945 n/a if (PyUnicode_GET_LENGTH(item) == 1) { 7946 n/a Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7947 n/a if (value == 0xFFFE) 7948 n/a goto Undefined; 7949 n/a if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7950 n/a goto onError; 7951 n/a } 7952 n/a else { 7953 n/a writer->overallocate = 1; 7954 n/a if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7955 n/a goto onError; 7956 n/a } 7957 n/a } 7958 n/a else { 7959 n/a /* wrong return value */ 7960 n/a PyErr_SetString(PyExc_TypeError, 7961 n/a "character mapping must return integer, None or str"); 7962 n/a goto onError; 7963 n/a } 7964 n/a Py_CLEAR(item); 7965 n/a ++s; 7966 n/a continue; 7967 n/a 7968 n/a Undefined: 7969 n/a /* undefined mapping */ 7970 n/a Py_CLEAR(item); 7971 n/a startinpos = s-starts; 7972 n/a endinpos = startinpos+1; 7973 n/a if (unicode_decode_call_errorhandler_writer( 7974 n/a errors, &errorHandler, 7975 n/a "charmap", "character maps to <undefined>", 7976 n/a &starts, &e, &startinpos, &endinpos, &exc, &s, 7977 n/a writer)) { 7978 n/a goto onError; 7979 n/a } 7980 n/a } 7981 n/a Py_XDECREF(errorHandler); 7982 n/a Py_XDECREF(exc); 7983 n/a return 0; 7984 n/a 7985 n/a onError: 7986 n/a Py_XDECREF(item); 7987 
n/a Py_XDECREF(errorHandler); 7988 n/a Py_XDECREF(exc); 7989 n/a return -1; 7990 n/a } 7991 n/a 7992 n/a PyObject * 7993 n/a PyUnicode_DecodeCharmap(const char *s, 7994 n/a Py_ssize_t size, 7995 n/a PyObject *mapping, 7996 n/a const char *errors) 7997 n/a { 7998 n/a _PyUnicodeWriter writer; 7999 n/a 8000 n/a /* Default to Latin-1 */ 8001 n/a if (mapping == NULL) 8002 n/a return PyUnicode_DecodeLatin1(s, size, errors); 8003 n/a 8004 n/a if (size == 0) 8005 n/a _Py_RETURN_UNICODE_EMPTY(); 8006 n/a _PyUnicodeWriter_Init(&writer); 8007 n/a writer.min_length = size; 8008 n/a if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 8009 n/a goto onError; 8010 n/a 8011 n/a if (PyUnicode_CheckExact(mapping)) { 8012 n/a if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 8013 n/a goto onError; 8014 n/a } 8015 n/a else { 8016 n/a if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 8017 n/a goto onError; 8018 n/a } 8019 n/a return _PyUnicodeWriter_Finish(&writer); 8020 n/a 8021 n/a onError: 8022 n/a _PyUnicodeWriter_Dealloc(&writer); 8023 n/a return NULL; 8024 n/a } 8025 n/a 8026 n/a /* Charmap encoding: the lookup table */ 8027 n/a 8028 n/a struct encoding_map { 8029 n/a PyObject_HEAD 8030 n/a unsigned char level1[32]; 8031 n/a int count2, count3; 8032 n/a unsigned char level23[1]; 8033 n/a }; 8034 n/a 8035 n/a static PyObject* 8036 n/a encoding_map_size(PyObject *obj, PyObject* args) 8037 n/a { 8038 n/a struct encoding_map *map = (struct encoding_map*)obj; 8039 n/a return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 8040 n/a 128*map->count3); 8041 n/a } 8042 n/a 8043 n/a static PyMethodDef encoding_map_methods[] = { 8044 n/a {"size", encoding_map_size, METH_NOARGS, 8045 n/a PyDoc_STR("Return the size (in bytes) of this object") }, 8046 n/a { 0 } 8047 n/a }; 8048 n/a 8049 n/a static void 8050 n/a encoding_map_dealloc(PyObject* o) 8051 n/a { 8052 n/a PyObject_FREE(o); 8053 n/a } 8054 n/a 8055 n/a static PyTypeObject 
EncodingMapType = { 8056 n/a PyVarObject_HEAD_INIT(NULL, 0) 8057 n/a "EncodingMap", /*tp_name*/ 8058 n/a sizeof(struct encoding_map), /*tp_basicsize*/ 8059 n/a 0, /*tp_itemsize*/ 8060 n/a /* methods */ 8061 n/a encoding_map_dealloc, /*tp_dealloc*/ 8062 n/a 0, /*tp_print*/ 8063 n/a 0, /*tp_getattr*/ 8064 n/a 0, /*tp_setattr*/ 8065 n/a 0, /*tp_reserved*/ 8066 n/a 0, /*tp_repr*/ 8067 n/a 0, /*tp_as_number*/ 8068 n/a 0, /*tp_as_sequence*/ 8069 n/a 0, /*tp_as_mapping*/ 8070 n/a 0, /*tp_hash*/ 8071 n/a 0, /*tp_call*/ 8072 n/a 0, /*tp_str*/ 8073 n/a 0, /*tp_getattro*/ 8074 n/a 0, /*tp_setattro*/ 8075 n/a 0, /*tp_as_buffer*/ 8076 n/a Py_TPFLAGS_DEFAULT, /*tp_flags*/ 8077 n/a 0, /*tp_doc*/ 8078 n/a 0, /*tp_traverse*/ 8079 n/a 0, /*tp_clear*/ 8080 n/a 0, /*tp_richcompare*/ 8081 n/a 0, /*tp_weaklistoffset*/ 8082 n/a 0, /*tp_iter*/ 8083 n/a 0, /*tp_iternext*/ 8084 n/a encoding_map_methods, /*tp_methods*/ 8085 n/a 0, /*tp_members*/ 8086 n/a 0, /*tp_getset*/ 8087 n/a 0, /*tp_base*/ 8088 n/a 0, /*tp_dict*/ 8089 n/a 0, /*tp_descr_get*/ 8090 n/a 0, /*tp_descr_set*/ 8091 n/a 0, /*tp_dictoffset*/ 8092 n/a 0, /*tp_init*/ 8093 n/a 0, /*tp_alloc*/ 8094 n/a 0, /*tp_new*/ 8095 n/a 0, /*tp_free*/ 8096 n/a 0, /*tp_is_gc*/ 8097 n/a }; 8098 n/a 8099 n/a PyObject* 8100 n/a PyUnicode_BuildEncodingMap(PyObject* string) 8101 n/a { 8102 n/a PyObject *result; 8103 n/a struct encoding_map *mresult; 8104 n/a int i; 8105 n/a int need_dict = 0; 8106 n/a unsigned char level1[32]; 8107 n/a unsigned char level2[512]; 8108 n/a unsigned char *mlevel1, *mlevel2, *mlevel3; 8109 n/a int count2 = 0, count3 = 0; 8110 n/a int kind; 8111 n/a void *data; 8112 n/a Py_ssize_t length; 8113 n/a Py_UCS4 ch; 8114 n/a 8115 n/a if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 8116 n/a PyErr_BadArgument(); 8117 n/a return NULL; 8118 n/a } 8119 n/a kind = PyUnicode_KIND(string); 8120 n/a data = PyUnicode_DATA(string); 8121 n/a length = PyUnicode_GET_LENGTH(string); 8122 n/a length = Py_MIN(length, 256); 8123 
n/a memset(level1, 0xFF, sizeof level1); 8124 n/a memset(level2, 0xFF, sizeof level2); 8125 n/a 8126 n/a /* If there isn't a one-to-one mapping of NULL to \0, 8127 n/a or if there are non-BMP characters, we need to use 8128 n/a a mapping dictionary. */ 8129 n/a if (PyUnicode_READ(kind, data, 0) != 0) 8130 n/a need_dict = 1; 8131 n/a for (i = 1; i < length; i++) { 8132 n/a int l1, l2; 8133 n/a ch = PyUnicode_READ(kind, data, i); 8134 n/a if (ch == 0 || ch > 0xFFFF) { 8135 n/a need_dict = 1; 8136 n/a break; 8137 n/a } 8138 n/a if (ch == 0xFFFE) 8139 n/a /* unmapped character */ 8140 n/a continue; 8141 n/a l1 = ch >> 11; 8142 n/a l2 = ch >> 7; 8143 n/a if (level1[l1] == 0xFF) 8144 n/a level1[l1] = count2++; 8145 n/a if (level2[l2] == 0xFF) 8146 n/a level2[l2] = count3++; 8147 n/a } 8148 n/a 8149 n/a if (count2 >= 0xFF || count3 >= 0xFF) 8150 n/a need_dict = 1; 8151 n/a 8152 n/a if (need_dict) { 8153 n/a PyObject *result = PyDict_New(); 8154 n/a PyObject *key, *value; 8155 n/a if (!result) 8156 n/a return NULL; 8157 n/a for (i = 0; i < length; i++) { 8158 n/a key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 8159 n/a value = PyLong_FromLong(i); 8160 n/a if (!key || !value) 8161 n/a goto failed1; 8162 n/a if (PyDict_SetItem(result, key, value) == -1) 8163 n/a goto failed1; 8164 n/a Py_DECREF(key); 8165 n/a Py_DECREF(value); 8166 n/a } 8167 n/a return result; 8168 n/a failed1: 8169 n/a Py_XDECREF(key); 8170 n/a Py_XDECREF(value); 8171 n/a Py_DECREF(result); 8172 n/a return NULL; 8173 n/a } 8174 n/a 8175 n/a /* Create a three-level trie */ 8176 n/a result = PyObject_MALLOC(sizeof(struct encoding_map) + 8177 n/a 16*count2 + 128*count3 - 1); 8178 n/a if (!result) 8179 n/a return PyErr_NoMemory(); 8180 n/a PyObject_Init(result, &EncodingMapType); 8181 n/a mresult = (struct encoding_map*)result; 8182 n/a mresult->count2 = count2; 8183 n/a mresult->count3 = count3; 8184 n/a mlevel1 = mresult->level1; 8185 n/a mlevel2 = mresult->level23; 8186 n/a mlevel3 = mresult->level23 
+ 16*count2; 8187 n/a memcpy(mlevel1, level1, 32); 8188 n/a memset(mlevel2, 0xFF, 16*count2); 8189 n/a memset(mlevel3, 0, 128*count3); 8190 n/a count3 = 0; 8191 n/a for (i = 1; i < length; i++) { 8192 n/a int o1, o2, o3, i2, i3; 8193 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8194 n/a if (ch == 0xFFFE) 8195 n/a /* unmapped character */ 8196 n/a continue; 8197 n/a o1 = ch>>11; 8198 n/a o2 = (ch>>7) & 0xF; 8199 n/a i2 = 16*mlevel1[o1] + o2; 8200 n/a if (mlevel2[i2] == 0xFF) 8201 n/a mlevel2[i2] = count3++; 8202 n/a o3 = ch & 0x7F; 8203 n/a i3 = 128*mlevel2[i2] + o3; 8204 n/a mlevel3[i3] = i; 8205 n/a } 8206 n/a return result; 8207 n/a } 8208 n/a 8209 n/a static int 8210 n/a encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 8211 n/a { 8212 n/a struct encoding_map *map = (struct encoding_map*)mapping; 8213 n/a int l1 = c>>11; 8214 n/a int l2 = (c>>7) & 0xF; 8215 n/a int l3 = c & 0x7F; 8216 n/a int i; 8217 n/a 8218 n/a if (c > 0xFFFF) 8219 n/a return -1; 8220 n/a if (c == 0) 8221 n/a return 0; 8222 n/a /* level 1*/ 8223 n/a i = map->level1[l1]; 8224 n/a if (i == 0xFF) { 8225 n/a return -1; 8226 n/a } 8227 n/a /* level 2*/ 8228 n/a i = map->level23[16*i+l2]; 8229 n/a if (i == 0xFF) { 8230 n/a return -1; 8231 n/a } 8232 n/a /* level 3 */ 8233 n/a i = map->level23[16*map->count2 + 128*i + l3]; 8234 n/a if (i == 0) { 8235 n/a return -1; 8236 n/a } 8237 n/a return i; 8238 n/a } 8239 n/a 8240 n/a /* Lookup the character ch in the mapping. If the character 8241 n/a can't be found, Py_None is returned (or NULL, if another 8242 n/a error occurred). 
*/ 8243 n/a static PyObject * 8244 n/a charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8245 n/a { 8246 n/a PyObject *w = PyLong_FromLong((long)c); 8247 n/a PyObject *x; 8248 n/a 8249 n/a if (w == NULL) 8250 n/a return NULL; 8251 n/a x = PyObject_GetItem(mapping, w); 8252 n/a Py_DECREF(w); 8253 n/a if (x == NULL) { 8254 n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8255 n/a /* No mapping found means: mapping is undefined. */ 8256 n/a PyErr_Clear(); 8257 n/a Py_RETURN_NONE; 8258 n/a } else 8259 n/a return NULL; 8260 n/a } 8261 n/a else if (x == Py_None) 8262 n/a return x; 8263 n/a else if (PyLong_Check(x)) { 8264 n/a long value = PyLong_AS_LONG(x); 8265 n/a if (value < 0 || value > 255) { 8266 n/a PyErr_SetString(PyExc_TypeError, 8267 n/a "character mapping must be in range(256)"); 8268 n/a Py_DECREF(x); 8269 n/a return NULL; 8270 n/a } 8271 n/a return x; 8272 n/a } 8273 n/a else if (PyBytes_Check(x)) 8274 n/a return x; 8275 n/a else { 8276 n/a /* wrong return value */ 8277 n/a PyErr_Format(PyExc_TypeError, 8278 n/a "character mapping must return integer, bytes or None, not %.400s", 8279 n/a x->ob_type->tp_name); 8280 n/a Py_DECREF(x); 8281 n/a return NULL; 8282 n/a } 8283 n/a } 8284 n/a 8285 n/a static int 8286 n/a charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8287 n/a { 8288 n/a Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8289 n/a /* exponentially overallocate to minimize reallocations */ 8290 n/a if (requiredsize < 2*outsize) 8291 n/a requiredsize = 2*outsize; 8292 n/a if (_PyBytes_Resize(outobj, requiredsize)) 8293 n/a return -1; 8294 n/a return 0; 8295 n/a } 8296 n/a 8297 n/a typedef enum charmapencode_result { 8298 n/a enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8299 n/a } charmapencode_result; 8300 n/a /* lookup the character, put the result in the output string and adjust 8301 n/a various state variables. Resize the output bytes object if not enough 8302 n/a space is available. 
Return a new reference to the object that 8303 n/a was put in the output buffer, or Py_None, if the mapping was undefined 8304 n/a (in which case no character was written) or NULL, if a 8305 n/a reallocation error occurred. The caller must decref the result */ 8306 n/a static charmapencode_result 8307 n/a charmapencode_output(Py_UCS4 c, PyObject *mapping, 8308 n/a PyObject **outobj, Py_ssize_t *outpos) 8309 n/a { 8310 n/a PyObject *rep; 8311 n/a char *outstart; 8312 n/a Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8313 n/a 8314 n/a if (Py_TYPE(mapping) == &EncodingMapType) { 8315 n/a int res = encoding_map_lookup(c, mapping); 8316 n/a Py_ssize_t requiredsize = *outpos+1; 8317 n/a if (res == -1) 8318 n/a return enc_FAILED; 8319 n/a if (outsize<requiredsize) 8320 n/a if (charmapencode_resize(outobj, outpos, requiredsize)) 8321 n/a return enc_EXCEPTION; 8322 n/a outstart = PyBytes_AS_STRING(*outobj); 8323 n/a outstart[(*outpos)++] = (char)res; 8324 n/a return enc_SUCCESS; 8325 n/a } 8326 n/a 8327 n/a rep = charmapencode_lookup(c, mapping); 8328 n/a if (rep==NULL) 8329 n/a return enc_EXCEPTION; 8330 n/a else if (rep==Py_None) { 8331 n/a Py_DECREF(rep); 8332 n/a return enc_FAILED; 8333 n/a } else { 8334 n/a if (PyLong_Check(rep)) { 8335 n/a Py_ssize_t requiredsize = *outpos+1; 8336 n/a if (outsize<requiredsize) 8337 n/a if (charmapencode_resize(outobj, outpos, requiredsize)) { 8338 n/a Py_DECREF(rep); 8339 n/a return enc_EXCEPTION; 8340 n/a } 8341 n/a outstart = PyBytes_AS_STRING(*outobj); 8342 n/a outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8343 n/a } 8344 n/a else { 8345 n/a const char *repchars = PyBytes_AS_STRING(rep); 8346 n/a Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8347 n/a Py_ssize_t requiredsize = *outpos+repsize; 8348 n/a if (outsize<requiredsize) 8349 n/a if (charmapencode_resize(outobj, outpos, requiredsize)) { 8350 n/a Py_DECREF(rep); 8351 n/a return enc_EXCEPTION; 8352 n/a } 8353 n/a outstart = PyBytes_AS_STRING(*outobj); 8354 n/a 
memcpy(outstart + *outpos, repchars, repsize); 8355 n/a *outpos += repsize; 8356 n/a } 8357 n/a } 8358 n/a Py_DECREF(rep); 8359 n/a return enc_SUCCESS; 8360 n/a } 8361 n/a 8362 n/a /* handle an error in PyUnicode_EncodeCharmap 8363 n/a Return 0 on success, -1 on error */ 8364 n/a static int 8365 n/a charmap_encoding_error( 8366 n/a PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8367 n/a PyObject **exceptionObject, 8368 n/a _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors, 8369 n/a PyObject **res, Py_ssize_t *respos) 8370 n/a { 8371 n/a PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8372 n/a Py_ssize_t size, repsize; 8373 n/a Py_ssize_t newpos; 8374 n/a enum PyUnicode_Kind kind; 8375 n/a void *data; 8376 n/a Py_ssize_t index; 8377 n/a /* startpos for collecting unencodable chars */ 8378 n/a Py_ssize_t collstartpos = *inpos; 8379 n/a Py_ssize_t collendpos = *inpos+1; 8380 n/a Py_ssize_t collpos; 8381 n/a char *encoding = "charmap"; 8382 n/a char *reason = "character maps to <undefined>"; 8383 n/a charmapencode_result x; 8384 n/a Py_UCS4 ch; 8385 n/a int val; 8386 n/a 8387 n/a if (PyUnicode_READY(unicode) == -1) 8388 n/a return -1; 8389 n/a size = PyUnicode_GET_LENGTH(unicode); 8390 n/a /* find all unencodable characters */ 8391 n/a while (collendpos < size) { 8392 n/a PyObject *rep; 8393 n/a if (Py_TYPE(mapping) == &EncodingMapType) { 8394 n/a ch = PyUnicode_READ_CHAR(unicode, collendpos); 8395 n/a val = encoding_map_lookup(ch, mapping); 8396 n/a if (val != -1) 8397 n/a break; 8398 n/a ++collendpos; 8399 n/a continue; 8400 n/a } 8401 n/a 8402 n/a ch = PyUnicode_READ_CHAR(unicode, collendpos); 8403 n/a rep = charmapencode_lookup(ch, mapping); 8404 n/a if (rep==NULL) 8405 n/a return -1; 8406 n/a else if (rep!=Py_None) { 8407 n/a Py_DECREF(rep); 8408 n/a break; 8409 n/a } 8410 n/a Py_DECREF(rep); 8411 n/a ++collendpos; 8412 n/a } 8413 n/a /* cache callback name lookup 8414 n/a * (if not done yet, i.e. 
it's the first error) */ 8415 n/a if (*error_handler == _Py_ERROR_UNKNOWN) 8416 n/a *error_handler = get_error_handler(errors); 8417 n/a 8418 n/a switch (*error_handler) { 8419 n/a case _Py_ERROR_STRICT: 8420 n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8421 n/a return -1; 8422 n/a 8423 n/a case _Py_ERROR_REPLACE: 8424 n/a for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8425 n/a x = charmapencode_output('?', mapping, res, respos); 8426 n/a if (x==enc_EXCEPTION) { 8427 n/a return -1; 8428 n/a } 8429 n/a else if (x==enc_FAILED) { 8430 n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8431 n/a return -1; 8432 n/a } 8433 n/a } 8434 n/a /* fall through */ 8435 n/a case _Py_ERROR_IGNORE: 8436 n/a *inpos = collendpos; 8437 n/a break; 8438 n/a 8439 n/a case _Py_ERROR_XMLCHARREFREPLACE: 8440 n/a /* generate replacement (temporarily (mis)uses p) */ 8441 n/a for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8442 n/a char buffer[2+29+1+1]; 8443 n/a char *cp; 8444 n/a sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8445 n/a for (cp = buffer; *cp; ++cp) { 8446 n/a x = charmapencode_output(*cp, mapping, res, respos); 8447 n/a if (x==enc_EXCEPTION) 8448 n/a return -1; 8449 n/a else if (x==enc_FAILED) { 8450 n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8451 n/a return -1; 8452 n/a } 8453 n/a } 8454 n/a } 8455 n/a *inpos = collendpos; 8456 n/a break; 8457 n/a 8458 n/a default: 8459 n/a repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, 8460 n/a encoding, reason, unicode, exceptionObject, 8461 n/a collstartpos, collendpos, &newpos); 8462 n/a if (repunicode == NULL) 8463 n/a return -1; 8464 n/a if (PyBytes_Check(repunicode)) { 8465 n/a /* Directly copy bytes result to output. 
*/ 8466 n/a Py_ssize_t outsize = PyBytes_Size(*res); 8467 n/a Py_ssize_t requiredsize; 8468 n/a repsize = PyBytes_Size(repunicode); 8469 n/a requiredsize = *respos + repsize; 8470 n/a if (requiredsize > outsize) 8471 n/a /* Make room for all additional bytes. */ 8472 n/a if (charmapencode_resize(res, respos, requiredsize)) { 8473 n/a Py_DECREF(repunicode); 8474 n/a return -1; 8475 n/a } 8476 n/a memcpy(PyBytes_AsString(*res) + *respos, 8477 n/a PyBytes_AsString(repunicode), repsize); 8478 n/a *respos += repsize; 8479 n/a *inpos = newpos; 8480 n/a Py_DECREF(repunicode); 8481 n/a break; 8482 n/a } 8483 n/a /* generate replacement */ 8484 n/a if (PyUnicode_READY(repunicode) == -1) { 8485 n/a Py_DECREF(repunicode); 8486 n/a return -1; 8487 n/a } 8488 n/a repsize = PyUnicode_GET_LENGTH(repunicode); 8489 n/a data = PyUnicode_DATA(repunicode); 8490 n/a kind = PyUnicode_KIND(repunicode); 8491 n/a for (index = 0; index < repsize; index++) { 8492 n/a Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8493 n/a x = charmapencode_output(repch, mapping, res, respos); 8494 n/a if (x==enc_EXCEPTION) { 8495 n/a Py_DECREF(repunicode); 8496 n/a return -1; 8497 n/a } 8498 n/a else if (x==enc_FAILED) { 8499 n/a Py_DECREF(repunicode); 8500 n/a raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8501 n/a return -1; 8502 n/a } 8503 n/a } 8504 n/a *inpos = newpos; 8505 n/a Py_DECREF(repunicode); 8506 n/a } 8507 n/a return 0; 8508 n/a } 8509 n/a 8510 n/a PyObject * 8511 n/a _PyUnicode_EncodeCharmap(PyObject *unicode, 8512 n/a PyObject *mapping, 8513 n/a const char *errors) 8514 n/a { 8515 n/a /* output object */ 8516 n/a PyObject *res = NULL; 8517 n/a /* current input position */ 8518 n/a Py_ssize_t inpos = 0; 8519 n/a Py_ssize_t size; 8520 n/a /* current output position */ 8521 n/a Py_ssize_t respos = 0; 8522 n/a PyObject *error_handler_obj = NULL; 8523 n/a PyObject *exc = NULL; 8524 n/a _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 8525 n/a 
void *data; 8526 n/a int kind; 8527 n/a 8528 n/a if (PyUnicode_READY(unicode) == -1) 8529 n/a return NULL; 8530 n/a size = PyUnicode_GET_LENGTH(unicode); 8531 n/a data = PyUnicode_DATA(unicode); 8532 n/a kind = PyUnicode_KIND(unicode); 8533 n/a 8534 n/a /* Default to Latin-1 */ 8535 n/a if (mapping == NULL) 8536 n/a return unicode_encode_ucs1(unicode, errors, 256); 8537 n/a 8538 n/a /* allocate enough for a simple encoding without 8539 n/a replacements, if we need more, we'll resize */ 8540 n/a res = PyBytes_FromStringAndSize(NULL, size); 8541 n/a if (res == NULL) 8542 n/a goto onError; 8543 n/a if (size == 0) 8544 n/a return res; 8545 n/a 8546 n/a while (inpos<size) { 8547 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8548 n/a /* try to encode it */ 8549 n/a charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8550 n/a if (x==enc_EXCEPTION) /* error */ 8551 n/a goto onError; 8552 n/a if (x==enc_FAILED) { /* unencodable character */ 8553 n/a if (charmap_encoding_error(unicode, &inpos, mapping, 8554 n/a &exc, 8555 n/a &error_handler, &error_handler_obj, errors, 8556 n/a &res, &respos)) { 8557 n/a goto onError; 8558 n/a } 8559 n/a } 8560 n/a else 8561 n/a /* done with this character => adjust input position */ 8562 n/a ++inpos; 8563 n/a } 8564 n/a 8565 n/a /* Resize if we allocated to much */ 8566 n/a if (respos<PyBytes_GET_SIZE(res)) 8567 n/a if (_PyBytes_Resize(&res, respos) < 0) 8568 n/a goto onError; 8569 n/a 8570 n/a Py_XDECREF(exc); 8571 n/a Py_XDECREF(error_handler_obj); 8572 n/a return res; 8573 n/a 8574 n/a onError: 8575 n/a Py_XDECREF(res); 8576 n/a Py_XDECREF(exc); 8577 n/a Py_XDECREF(error_handler_obj); 8578 n/a return NULL; 8579 n/a } 8580 n/a 8581 n/a /* Deprecated */ 8582 n/a PyObject * 8583 n/a PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8584 n/a Py_ssize_t size, 8585 n/a PyObject *mapping, 8586 n/a const char *errors) 8587 n/a { 8588 n/a PyObject *result; 8589 n/a PyObject *unicode = PyUnicode_FromWideChar(p, size); 8590 
n/a if (unicode == NULL) 8591 n/a return NULL; 8592 n/a result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8593 n/a Py_DECREF(unicode); 8594 n/a return result; 8595 n/a } 8596 n/a 8597 n/a PyObject * 8598 n/a PyUnicode_AsCharmapString(PyObject *unicode, 8599 n/a PyObject *mapping) 8600 n/a { 8601 n/a if (!PyUnicode_Check(unicode) || mapping == NULL) { 8602 n/a PyErr_BadArgument(); 8603 n/a return NULL; 8604 n/a } 8605 n/a return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8606 n/a } 8607 n/a 8608 n/a /* create or adjust a UnicodeTranslateError */ 8609 n/a static void 8610 n/a make_translate_exception(PyObject **exceptionObject, 8611 n/a PyObject *unicode, 8612 n/a Py_ssize_t startpos, Py_ssize_t endpos, 8613 n/a const char *reason) 8614 n/a { 8615 n/a if (*exceptionObject == NULL) { 8616 n/a *exceptionObject = _PyUnicodeTranslateError_Create( 8617 n/a unicode, startpos, endpos, reason); 8618 n/a } 8619 n/a else { 8620 n/a if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8621 n/a goto onError; 8622 n/a if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8623 n/a goto onError; 8624 n/a if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8625 n/a goto onError; 8626 n/a return; 8627 n/a onError: 8628 n/a Py_CLEAR(*exceptionObject); 8629 n/a } 8630 n/a } 8631 n/a 8632 n/a /* error handling callback helper: 8633 n/a build arguments, call the callback and check the arguments, 8634 n/a put the result into newpos and return the replacement string, which 8635 n/a has to be freed by the caller */ 8636 n/a static PyObject * 8637 n/a unicode_translate_call_errorhandler(const char *errors, 8638 n/a PyObject **errorHandler, 8639 n/a const char *reason, 8640 n/a PyObject *unicode, PyObject **exceptionObject, 8641 n/a Py_ssize_t startpos, Py_ssize_t endpos, 8642 n/a Py_ssize_t *newpos) 8643 n/a { 8644 n/a static const char *argparse = "Un;translating error handler must return (str, int) tuple"; 8645 n/a 8646 n/a Py_ssize_t 
i_newpos; 8647 n/a PyObject *restuple; 8648 n/a PyObject *resunicode; 8649 n/a 8650 n/a if (*errorHandler == NULL) { 8651 n/a *errorHandler = PyCodec_LookupError(errors); 8652 n/a if (*errorHandler == NULL) 8653 n/a return NULL; 8654 n/a } 8655 n/a 8656 n/a make_translate_exception(exceptionObject, 8657 n/a unicode, startpos, endpos, reason); 8658 n/a if (*exceptionObject == NULL) 8659 n/a return NULL; 8660 n/a 8661 n/a restuple = PyObject_CallFunctionObjArgs( 8662 n/a *errorHandler, *exceptionObject, NULL); 8663 n/a if (restuple == NULL) 8664 n/a return NULL; 8665 n/a if (!PyTuple_Check(restuple)) { 8666 n/a PyErr_SetString(PyExc_TypeError, &argparse[3]); 8667 n/a Py_DECREF(restuple); 8668 n/a return NULL; 8669 n/a } 8670 n/a if (!PyArg_ParseTuple(restuple, argparse, 8671 n/a &resunicode, &i_newpos)) { 8672 n/a Py_DECREF(restuple); 8673 n/a return NULL; 8674 n/a } 8675 n/a if (i_newpos<0) 8676 n/a *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8677 n/a else 8678 n/a *newpos = i_newpos; 8679 n/a if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8680 n/a PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8681 n/a Py_DECREF(restuple); 8682 n/a return NULL; 8683 n/a } 8684 n/a Py_INCREF(resunicode); 8685 n/a Py_DECREF(restuple); 8686 n/a return resunicode; 8687 n/a } 8688 n/a 8689 n/a /* Lookup the character ch in the mapping and put the result in result, 8690 n/a which must be decrefed by the caller. 8691 n/a Return 0 on success, -1 on error */ 8692 n/a static int 8693 n/a charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8694 n/a { 8695 n/a PyObject *w = PyLong_FromLong((long)c); 8696 n/a PyObject *x; 8697 n/a 8698 n/a if (w == NULL) 8699 n/a return -1; 8700 n/a x = PyObject_GetItem(mapping, w); 8701 n/a Py_DECREF(w); 8702 n/a if (x == NULL) { 8703 n/a if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8704 n/a /* No mapping found means: use 1:1 mapping. 
*/ 8705 n/a PyErr_Clear(); 8706 n/a *result = NULL; 8707 n/a return 0; 8708 n/a } else 8709 n/a return -1; 8710 n/a } 8711 n/a else if (x == Py_None) { 8712 n/a *result = x; 8713 n/a return 0; 8714 n/a } 8715 n/a else if (PyLong_Check(x)) { 8716 n/a long value = PyLong_AS_LONG(x); 8717 n/a if (value < 0 || value > MAX_UNICODE) { 8718 n/a PyErr_Format(PyExc_ValueError, 8719 n/a "character mapping must be in range(0x%x)", 8720 n/a MAX_UNICODE+1); 8721 n/a Py_DECREF(x); 8722 n/a return -1; 8723 n/a } 8724 n/a *result = x; 8725 n/a return 0; 8726 n/a } 8727 n/a else if (PyUnicode_Check(x)) { 8728 n/a *result = x; 8729 n/a return 0; 8730 n/a } 8731 n/a else { 8732 n/a /* wrong return value */ 8733 n/a PyErr_SetString(PyExc_TypeError, 8734 n/a "character mapping must return integer, None or str"); 8735 n/a Py_DECREF(x); 8736 n/a return -1; 8737 n/a } 8738 n/a } 8739 n/a 8740 n/a /* lookup the character, write the result into the writer. 8741 n/a Return 1 if the result was written into the writer, return 0 if the mapping 8742 n/a was undefined, raise an exception return -1 on error. 
*/ 8743 n/a static int 8744 n/a charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8745 n/a _PyUnicodeWriter *writer) 8746 n/a { 8747 n/a PyObject *item; 8748 n/a 8749 n/a if (charmaptranslate_lookup(ch, mapping, &item)) 8750 n/a return -1; 8751 n/a 8752 n/a if (item == NULL) { 8753 n/a /* not found => default to 1:1 mapping */ 8754 n/a if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8755 n/a return -1; 8756 n/a } 8757 n/a return 1; 8758 n/a } 8759 n/a 8760 n/a if (item == Py_None) { 8761 n/a Py_DECREF(item); 8762 n/a return 0; 8763 n/a } 8764 n/a 8765 n/a if (PyLong_Check(item)) { 8766 n/a long ch = (Py_UCS4)PyLong_AS_LONG(item); 8767 n/a /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8768 n/a used it */ 8769 n/a if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8770 n/a Py_DECREF(item); 8771 n/a return -1; 8772 n/a } 8773 n/a Py_DECREF(item); 8774 n/a return 1; 8775 n/a } 8776 n/a 8777 n/a if (!PyUnicode_Check(item)) { 8778 n/a Py_DECREF(item); 8779 n/a return -1; 8780 n/a } 8781 n/a 8782 n/a if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8783 n/a Py_DECREF(item); 8784 n/a return -1; 8785 n/a } 8786 n/a 8787 n/a Py_DECREF(item); 8788 n/a return 1; 8789 n/a } 8790 n/a 8791 n/a static int 8792 n/a unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8793 n/a Py_UCS1 *translate) 8794 n/a { 8795 n/a PyObject *item = NULL; 8796 n/a int ret = 0; 8797 n/a 8798 n/a if (charmaptranslate_lookup(ch, mapping, &item)) { 8799 n/a return -1; 8800 n/a } 8801 n/a 8802 n/a if (item == Py_None) { 8803 n/a /* deletion */ 8804 n/a translate[ch] = 0xfe; 8805 n/a } 8806 n/a else if (item == NULL) { 8807 n/a /* not found => default to 1:1 mapping */ 8808 n/a translate[ch] = ch; 8809 n/a return 1; 8810 n/a } 8811 n/a else if (PyLong_Check(item)) { 8812 n/a long replace = PyLong_AS_LONG(item); 8813 n/a /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8814 n/a used it */ 8815 n/a if (127 < replace) { 8816 n/a /* invalid 
character or character outside ASCII: 8817 n/a skip the fast translate */ 8818 n/a goto exit; 8819 n/a } 8820 n/a translate[ch] = (Py_UCS1)replace; 8821 n/a } 8822 n/a else if (PyUnicode_Check(item)) { 8823 n/a Py_UCS4 replace; 8824 n/a 8825 n/a if (PyUnicode_READY(item) == -1) { 8826 n/a Py_DECREF(item); 8827 n/a return -1; 8828 n/a } 8829 n/a if (PyUnicode_GET_LENGTH(item) != 1) 8830 n/a goto exit; 8831 n/a 8832 n/a replace = PyUnicode_READ_CHAR(item, 0); 8833 n/a if (replace > 127) 8834 n/a goto exit; 8835 n/a translate[ch] = (Py_UCS1)replace; 8836 n/a } 8837 n/a else { 8838 n/a /* not None, NULL, long or unicode */ 8839 n/a goto exit; 8840 n/a } 8841 n/a ret = 1; 8842 n/a 8843 n/a exit: 8844 n/a Py_DECREF(item); 8845 n/a return ret; 8846 n/a } 8847 n/a 8848 n/a /* Fast path for ascii => ascii translation. Return 1 if the whole string 8849 n/a was translated into writer, return 0 if the input string was partially 8850 n/a translated into writer, raise an exception and return -1 on error. 
*/ 8851 n/a static int 8852 n/a unicode_fast_translate(PyObject *input, PyObject *mapping, 8853 n/a _PyUnicodeWriter *writer, int ignore, 8854 n/a Py_ssize_t *input_pos) 8855 n/a { 8856 n/a Py_UCS1 ascii_table[128], ch, ch2; 8857 n/a Py_ssize_t len; 8858 n/a Py_UCS1 *in, *end, *out; 8859 n/a int res = 0; 8860 n/a 8861 n/a len = PyUnicode_GET_LENGTH(input); 8862 n/a 8863 n/a memset(ascii_table, 0xff, 128); 8864 n/a 8865 n/a in = PyUnicode_1BYTE_DATA(input); 8866 n/a end = in + len; 8867 n/a 8868 n/a assert(PyUnicode_IS_ASCII(writer->buffer)); 8869 n/a assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8870 n/a out = PyUnicode_1BYTE_DATA(writer->buffer); 8871 n/a 8872 n/a for (; in < end; in++) { 8873 n/a ch = *in; 8874 n/a ch2 = ascii_table[ch]; 8875 n/a if (ch2 == 0xff) { 8876 n/a int translate = unicode_fast_translate_lookup(mapping, ch, 8877 n/a ascii_table); 8878 n/a if (translate < 0) 8879 n/a return -1; 8880 n/a if (translate == 0) 8881 n/a goto exit; 8882 n/a ch2 = ascii_table[ch]; 8883 n/a } 8884 n/a if (ch2 == 0xfe) { 8885 n/a if (ignore) 8886 n/a continue; 8887 n/a goto exit; 8888 n/a } 8889 n/a assert(ch2 < 128); 8890 n/a *out = ch2; 8891 n/a out++; 8892 n/a } 8893 n/a res = 1; 8894 n/a 8895 n/a exit: 8896 n/a writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8897 n/a *input_pos = in - PyUnicode_1BYTE_DATA(input); 8898 n/a return res; 8899 n/a } 8900 n/a 8901 n/a static PyObject * 8902 n/a _PyUnicode_TranslateCharmap(PyObject *input, 8903 n/a PyObject *mapping, 8904 n/a const char *errors) 8905 n/a { 8906 n/a /* input object */ 8907 n/a char *data; 8908 n/a Py_ssize_t size, i; 8909 n/a int kind; 8910 n/a /* output buffer */ 8911 n/a _PyUnicodeWriter writer; 8912 n/a /* error handler */ 8913 n/a char *reason = "character maps to <undefined>"; 8914 n/a PyObject *errorHandler = NULL; 8915 n/a PyObject *exc = NULL; 8916 n/a int ignore; 8917 n/a int res; 8918 n/a 8919 n/a if (mapping == NULL) { 8920 n/a PyErr_BadArgument(); 8921 n/a return NULL; 8922 
n/a } 8923 n/a 8924 n/a if (PyUnicode_READY(input) == -1) 8925 n/a return NULL; 8926 n/a data = (char*)PyUnicode_DATA(input); 8927 n/a kind = PyUnicode_KIND(input); 8928 n/a size = PyUnicode_GET_LENGTH(input); 8929 n/a 8930 n/a if (size == 0) 8931 n/a return PyUnicode_FromObject(input); 8932 n/a 8933 n/a /* allocate enough for a simple 1:1 translation without 8934 n/a replacements, if we need more, we'll resize */ 8935 n/a _PyUnicodeWriter_Init(&writer); 8936 n/a if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8937 n/a goto onError; 8938 n/a 8939 n/a ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8940 n/a 8941 n/a if (PyUnicode_READY(input) == -1) 8942 n/a return NULL; 8943 n/a if (PyUnicode_IS_ASCII(input)) { 8944 n/a res = unicode_fast_translate(input, mapping, &writer, ignore, &i); 8945 n/a if (res < 0) { 8946 n/a _PyUnicodeWriter_Dealloc(&writer); 8947 n/a return NULL; 8948 n/a } 8949 n/a if (res == 1) 8950 n/a return _PyUnicodeWriter_Finish(&writer); 8951 n/a } 8952 n/a else { 8953 n/a i = 0; 8954 n/a } 8955 n/a 8956 n/a while (i<size) { 8957 n/a /* try to encode it */ 8958 n/a int translate; 8959 n/a PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8960 n/a Py_ssize_t newpos; 8961 n/a /* startpos for collecting untranslatable chars */ 8962 n/a Py_ssize_t collstart; 8963 n/a Py_ssize_t collend; 8964 n/a Py_UCS4 ch; 8965 n/a 8966 n/a ch = PyUnicode_READ(kind, data, i); 8967 n/a translate = charmaptranslate_output(ch, mapping, &writer); 8968 n/a if (translate < 0) 8969 n/a goto onError; 8970 n/a 8971 n/a if (translate != 0) { 8972 n/a /* it worked => adjust input pointer */ 8973 n/a ++i; 8974 n/a continue; 8975 n/a } 8976 n/a 8977 n/a /* untranslatable character */ 8978 n/a collstart = i; 8979 n/a collend = i+1; 8980 n/a 8981 n/a /* find all untranslatable characters */ 8982 n/a while (collend < size) { 8983 n/a PyObject *x; 8984 n/a ch = PyUnicode_READ(kind, data, collend); 8985 n/a if (charmaptranslate_lookup(ch, 
mapping, &x)) 8986 n/a goto onError; 8987 n/a Py_XDECREF(x); 8988 n/a if (x != Py_None) 8989 n/a break; 8990 n/a ++collend; 8991 n/a } 8992 n/a 8993 n/a if (ignore) { 8994 n/a i = collend; 8995 n/a } 8996 n/a else { 8997 n/a repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8998 n/a reason, input, &exc, 8999 n/a collstart, collend, &newpos); 9000 n/a if (repunicode == NULL) 9001 n/a goto onError; 9002 n/a if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 9003 n/a Py_DECREF(repunicode); 9004 n/a goto onError; 9005 n/a } 9006 n/a Py_DECREF(repunicode); 9007 n/a i = newpos; 9008 n/a } 9009 n/a } 9010 n/a Py_XDECREF(exc); 9011 n/a Py_XDECREF(errorHandler); 9012 n/a return _PyUnicodeWriter_Finish(&writer); 9013 n/a 9014 n/a onError: 9015 n/a _PyUnicodeWriter_Dealloc(&writer); 9016 n/a Py_XDECREF(exc); 9017 n/a Py_XDECREF(errorHandler); 9018 n/a return NULL; 9019 n/a } 9020 n/a 9021 n/a /* Deprecated. Use PyUnicode_Translate instead. */ 9022 n/a PyObject * 9023 n/a PyUnicode_TranslateCharmap(const Py_UNICODE *p, 9024 n/a Py_ssize_t size, 9025 n/a PyObject *mapping, 9026 n/a const char *errors) 9027 n/a { 9028 n/a PyObject *result; 9029 n/a PyObject *unicode = PyUnicode_FromWideChar(p, size); 9030 n/a if (!unicode) 9031 n/a return NULL; 9032 n/a result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 9033 n/a Py_DECREF(unicode); 9034 n/a return result; 9035 n/a } 9036 n/a 9037 n/a PyObject * 9038 n/a PyUnicode_Translate(PyObject *str, 9039 n/a PyObject *mapping, 9040 n/a const char *errors) 9041 n/a { 9042 n/a if (ensure_unicode(str) < 0) 9043 n/a return NULL; 9044 n/a return _PyUnicode_TranslateCharmap(str, mapping, errors); 9045 n/a } 9046 n/a 9047 n/a static Py_UCS4 9048 n/a fix_decimal_and_space_to_ascii(PyObject *self) 9049 n/a { 9050 n/a /* No need to call PyUnicode_READY(self) because this function is only 9051 n/a called as a callback from fixup() which does it already. 
*/ 9052 n/a const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9053 n/a const int kind = PyUnicode_KIND(self); 9054 n/a void *data = PyUnicode_DATA(self); 9055 n/a Py_UCS4 maxchar = 127, ch, fixed; 9056 n/a int modified = 0; 9057 n/a Py_ssize_t i; 9058 n/a 9059 n/a for (i = 0; i < len; ++i) { 9060 n/a ch = PyUnicode_READ(kind, data, i); 9061 n/a fixed = 0; 9062 n/a if (ch > 127) { 9063 n/a if (Py_UNICODE_ISSPACE(ch)) 9064 n/a fixed = ' '; 9065 n/a else { 9066 n/a const int decimal = Py_UNICODE_TODECIMAL(ch); 9067 n/a if (decimal >= 0) 9068 n/a fixed = '0' + decimal; 9069 n/a } 9070 n/a if (fixed != 0) { 9071 n/a modified = 1; 9072 n/a maxchar = Py_MAX(maxchar, fixed); 9073 n/a PyUnicode_WRITE(kind, data, i, fixed); 9074 n/a } 9075 n/a else 9076 n/a maxchar = Py_MAX(maxchar, ch); 9077 n/a } 9078 n/a } 9079 n/a 9080 n/a return (modified) ? maxchar : 0; 9081 n/a } 9082 n/a 9083 n/a PyObject * 9084 n/a _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 9085 n/a { 9086 n/a if (!PyUnicode_Check(unicode)) { 9087 n/a PyErr_BadInternalCall(); 9088 n/a return NULL; 9089 n/a } 9090 n/a if (PyUnicode_READY(unicode) == -1) 9091 n/a return NULL; 9092 n/a if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 9093 n/a /* If the string is already ASCII, just return the same string */ 9094 n/a Py_INCREF(unicode); 9095 n/a return unicode; 9096 n/a } 9097 n/a return fixup(unicode, fix_decimal_and_space_to_ascii); 9098 n/a } 9099 n/a 9100 n/a PyObject * 9101 n/a PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 9102 n/a Py_ssize_t length) 9103 n/a { 9104 n/a PyObject *decimal; 9105 n/a Py_ssize_t i; 9106 n/a Py_UCS4 maxchar; 9107 n/a enum PyUnicode_Kind kind; 9108 n/a void *data; 9109 n/a 9110 n/a maxchar = 127; 9111 n/a for (i = 0; i < length; i++) { 9112 n/a Py_UCS4 ch = s[i]; 9113 n/a if (ch > 127) { 9114 n/a int decimal = Py_UNICODE_TODECIMAL(ch); 9115 n/a if (decimal >= 0) 9116 n/a ch = '0' + decimal; 9117 n/a maxchar = Py_MAX(maxchar, ch); 9118 n/a } 9119 n/a } 9120 n/a 9121 
n/a /* Copy to a new string */ 9122 n/a decimal = PyUnicode_New(length, maxchar); 9123 n/a if (decimal == NULL) 9124 n/a return decimal; 9125 n/a kind = PyUnicode_KIND(decimal); 9126 n/a data = PyUnicode_DATA(decimal); 9127 n/a /* Iterate over code points */ 9128 n/a for (i = 0; i < length; i++) { 9129 n/a Py_UCS4 ch = s[i]; 9130 n/a if (ch > 127) { 9131 n/a int decimal = Py_UNICODE_TODECIMAL(ch); 9132 n/a if (decimal >= 0) 9133 n/a ch = '0' + decimal; 9134 n/a } 9135 n/a PyUnicode_WRITE(kind, data, i, ch); 9136 n/a } 9137 n/a return unicode_result(decimal); 9138 n/a } 9139 n/a /* --- Decimal Encoder ---------------------------------------------------- */ 9140 n/a 9141 n/a int 9142 n/a PyUnicode_EncodeDecimal(Py_UNICODE *s, 9143 n/a Py_ssize_t length, 9144 n/a char *output, 9145 n/a const char *errors) 9146 n/a { 9147 n/a PyObject *unicode; 9148 n/a Py_ssize_t i; 9149 n/a enum PyUnicode_Kind kind; 9150 n/a void *data; 9151 n/a 9152 n/a if (output == NULL) { 9153 n/a PyErr_BadArgument(); 9154 n/a return -1; 9155 n/a } 9156 n/a 9157 n/a unicode = PyUnicode_FromWideChar(s, length); 9158 n/a if (unicode == NULL) 9159 n/a return -1; 9160 n/a 9161 n/a kind = PyUnicode_KIND(unicode); 9162 n/a data = PyUnicode_DATA(unicode); 9163 n/a 9164 n/a for (i=0; i < length; ) { 9165 n/a PyObject *exc; 9166 n/a Py_UCS4 ch; 9167 n/a int decimal; 9168 n/a Py_ssize_t startpos; 9169 n/a 9170 n/a ch = PyUnicode_READ(kind, data, i); 9171 n/a 9172 n/a if (Py_UNICODE_ISSPACE(ch)) { 9173 n/a *output++ = ' '; 9174 n/a i++; 9175 n/a continue; 9176 n/a } 9177 n/a decimal = Py_UNICODE_TODECIMAL(ch); 9178 n/a if (decimal >= 0) { 9179 n/a *output++ = '0' + decimal; 9180 n/a i++; 9181 n/a continue; 9182 n/a } 9183 n/a if (0 < ch && ch < 256) { 9184 n/a *output++ = (char)ch; 9185 n/a i++; 9186 n/a continue; 9187 n/a } 9188 n/a 9189 n/a startpos = i; 9190 n/a exc = NULL; 9191 n/a raise_encode_exception(&exc, "decimal", unicode, 9192 n/a startpos, startpos+1, 9193 n/a "invalid decimal Unicode string"); 
9194 n/a Py_XDECREF(exc); 9195 n/a Py_DECREF(unicode); 9196 n/a return -1; 9197 n/a } 9198 n/a /* 0-terminate the output string */ 9199 n/a *output++ = '\0'; 9200 n/a Py_DECREF(unicode); 9201 n/a return 0; 9202 n/a } 9203 n/a 9204 n/a /* --- Helpers ------------------------------------------------------------ */ 9205 n/a 9206 n/a /* helper macro to fixup start/end slice values */ 9207 n/a #define ADJUST_INDICES(start, end, len) \ 9208 n/a if (end > len) \ 9209 n/a end = len; \ 9210 n/a else if (end < 0) { \ 9211 n/a end += len; \ 9212 n/a if (end < 0) \ 9213 n/a end = 0; \ 9214 n/a } \ 9215 n/a if (start < 0) { \ 9216 n/a start += len; \ 9217 n/a if (start < 0) \ 9218 n/a start = 0; \ 9219 n/a } 9220 n/a 9221 n/a static Py_ssize_t 9222 n/a any_find_slice(PyObject* s1, PyObject* s2, 9223 n/a Py_ssize_t start, 9224 n/a Py_ssize_t end, 9225 n/a int direction) 9226 n/a { 9227 n/a int kind1, kind2; 9228 n/a void *buf1, *buf2; 9229 n/a Py_ssize_t len1, len2, result; 9230 n/a 9231 n/a kind1 = PyUnicode_KIND(s1); 9232 n/a kind2 = PyUnicode_KIND(s2); 9233 n/a if (kind1 < kind2) 9234 n/a return -1; 9235 n/a 9236 n/a len1 = PyUnicode_GET_LENGTH(s1); 9237 n/a len2 = PyUnicode_GET_LENGTH(s2); 9238 n/a ADJUST_INDICES(start, end, len1); 9239 n/a if (end - start < len2) 9240 n/a return -1; 9241 n/a 9242 n/a buf1 = PyUnicode_DATA(s1); 9243 n/a buf2 = PyUnicode_DATA(s2); 9244 n/a if (len2 == 1) { 9245 n/a Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 9246 n/a result = findchar((const char *)buf1 + kind1*start, 9247 n/a kind1, end - start, ch, direction); 9248 n/a if (result == -1) 9249 n/a return -1; 9250 n/a else 9251 n/a return start + result; 9252 n/a } 9253 n/a 9254 n/a if (kind2 != kind1) { 9255 n/a buf2 = _PyUnicode_AsKind(s2, kind1); 9256 n/a if (!buf2) 9257 n/a return -2; 9258 n/a } 9259 n/a 9260 n/a if (direction > 0) { 9261 n/a switch (kind1) { 9262 n/a case PyUnicode_1BYTE_KIND: 9263 n/a if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9264 n/a result = 
asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9265 n/a else 9266 n/a result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9267 n/a break; 9268 n/a case PyUnicode_2BYTE_KIND: 9269 n/a result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9270 n/a break; 9271 n/a case PyUnicode_4BYTE_KIND: 9272 n/a result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9273 n/a break; 9274 n/a default: 9275 n/a assert(0); result = -2; 9276 n/a } 9277 n/a } 9278 n/a else { 9279 n/a switch (kind1) { 9280 n/a case PyUnicode_1BYTE_KIND: 9281 n/a if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9282 n/a result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9283 n/a else 9284 n/a result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9285 n/a break; 9286 n/a case PyUnicode_2BYTE_KIND: 9287 n/a result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9288 n/a break; 9289 n/a case PyUnicode_4BYTE_KIND: 9290 n/a result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9291 n/a break; 9292 n/a default: 9293 n/a assert(0); result = -2; 9294 n/a } 9295 n/a } 9296 n/a 9297 n/a if (kind2 != kind1) 9298 n/a PyMem_Free(buf2); 9299 n/a 9300 n/a return result; 9301 n/a } 9302 n/a 9303 n/a Py_ssize_t 9304 n/a _PyUnicode_InsertThousandsGrouping( 9305 n/a PyObject *unicode, Py_ssize_t index, 9306 n/a Py_ssize_t n_buffer, 9307 n/a void *digits, Py_ssize_t n_digits, 9308 n/a Py_ssize_t min_width, 9309 n/a const char *grouping, PyObject *thousands_sep, 9310 n/a Py_UCS4 *maxchar) 9311 n/a { 9312 n/a unsigned int kind, thousands_sep_kind; 9313 n/a char *data, *thousands_sep_data; 9314 n/a Py_ssize_t thousands_sep_len; 9315 n/a Py_ssize_t len; 9316 n/a 9317 n/a if (unicode != NULL) { 9318 n/a kind = PyUnicode_KIND(unicode); 9319 n/a data = (char *) PyUnicode_DATA(unicode) + index * kind; 9320 n/a } 9321 n/a else { 9322 n/a kind = PyUnicode_1BYTE_KIND; 9323 n/a data = NULL; 9324 n/a } 9325 n/a thousands_sep_kind = 
PyUnicode_KIND(thousands_sep); 9326 n/a thousands_sep_data = PyUnicode_DATA(thousands_sep); 9327 n/a thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9328 n/a if (unicode != NULL && thousands_sep_kind != kind) { 9329 n/a if (thousands_sep_kind < kind) { 9330 n/a thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 9331 n/a if (!thousands_sep_data) 9332 n/a return -1; 9333 n/a } 9334 n/a else { 9335 n/a data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 9336 n/a if (!data) 9337 n/a return -1; 9338 n/a } 9339 n/a } 9340 n/a 9341 n/a switch (kind) { 9342 n/a case PyUnicode_1BYTE_KIND: 9343 n/a if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9344 n/a len = asciilib_InsertThousandsGrouping( 9345 n/a (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 9346 n/a min_width, grouping, 9347 n/a (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9348 n/a else 9349 n/a len = ucs1lib_InsertThousandsGrouping( 9350 n/a (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9351 n/a min_width, grouping, 9352 n/a (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9353 n/a break; 9354 n/a case PyUnicode_2BYTE_KIND: 9355 n/a len = ucs2lib_InsertThousandsGrouping( 9356 n/a (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9357 n/a min_width, grouping, 9358 n/a (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9359 n/a break; 9360 n/a case PyUnicode_4BYTE_KIND: 9361 n/a len = ucs4lib_InsertThousandsGrouping( 9362 n/a (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9363 n/a min_width, grouping, 9364 n/a (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9365 n/a break; 9366 n/a default: 9367 n/a assert(0); 9368 n/a return -1; 9369 n/a } 9370 n/a if (unicode != NULL && thousands_sep_kind != kind) { 9371 n/a if (thousands_sep_kind < kind) 9372 n/a PyMem_Free(thousands_sep_data); 9373 n/a else 9374 n/a PyMem_Free(data); 9375 n/a } 9376 n/a if (unicode == NULL) { 9377 n/a *maxchar = 127; 9378 n/a if (len != n_digits) { 9379 n/a *maxchar = 
Py_MAX(*maxchar, 9380 n/a PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9381 n/a } 9382 n/a } 9383 n/a return len; 9384 n/a } 9385 n/a 9386 n/a 9387 n/a Py_ssize_t 9388 n/a PyUnicode_Count(PyObject *str, 9389 n/a PyObject *substr, 9390 n/a Py_ssize_t start, 9391 n/a Py_ssize_t end) 9392 n/a { 9393 n/a Py_ssize_t result; 9394 n/a int kind1, kind2; 9395 n/a void *buf1 = NULL, *buf2 = NULL; 9396 n/a Py_ssize_t len1, len2; 9397 n/a 9398 n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9399 n/a return -1; 9400 n/a 9401 n/a kind1 = PyUnicode_KIND(str); 9402 n/a kind2 = PyUnicode_KIND(substr); 9403 n/a if (kind1 < kind2) 9404 n/a return 0; 9405 n/a 9406 n/a len1 = PyUnicode_GET_LENGTH(str); 9407 n/a len2 = PyUnicode_GET_LENGTH(substr); 9408 n/a ADJUST_INDICES(start, end, len1); 9409 n/a if (end - start < len2) 9410 n/a return 0; 9411 n/a 9412 n/a buf1 = PyUnicode_DATA(str); 9413 n/a buf2 = PyUnicode_DATA(substr); 9414 n/a if (kind2 != kind1) { 9415 n/a buf2 = _PyUnicode_AsKind(substr, kind1); 9416 n/a if (!buf2) 9417 n/a goto onError; 9418 n/a } 9419 n/a 9420 n/a switch (kind1) { 9421 n/a case PyUnicode_1BYTE_KIND: 9422 n/a if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) 9423 n/a result = asciilib_count( 9424 n/a ((Py_UCS1*)buf1) + start, end - start, 9425 n/a buf2, len2, PY_SSIZE_T_MAX 9426 n/a ); 9427 n/a else 9428 n/a result = ucs1lib_count( 9429 n/a ((Py_UCS1*)buf1) + start, end - start, 9430 n/a buf2, len2, PY_SSIZE_T_MAX 9431 n/a ); 9432 n/a break; 9433 n/a case PyUnicode_2BYTE_KIND: 9434 n/a result = ucs2lib_count( 9435 n/a ((Py_UCS2*)buf1) + start, end - start, 9436 n/a buf2, len2, PY_SSIZE_T_MAX 9437 n/a ); 9438 n/a break; 9439 n/a case PyUnicode_4BYTE_KIND: 9440 n/a result = ucs4lib_count( 9441 n/a ((Py_UCS4*)buf1) + start, end - start, 9442 n/a buf2, len2, PY_SSIZE_T_MAX 9443 n/a ); 9444 n/a break; 9445 n/a default: 9446 n/a assert(0); result = 0; 9447 n/a } 9448 n/a 9449 n/a if (kind2 != kind1) 9450 n/a PyMem_Free(buf2); 9451 n/a 9452 n/a 
return result; 9453 n/a onError: 9454 n/a if (kind2 != kind1 && buf2) 9455 n/a PyMem_Free(buf2); 9456 n/a return -1; 9457 n/a } 9458 n/a 9459 n/a Py_ssize_t 9460 n/a PyUnicode_Find(PyObject *str, 9461 n/a PyObject *substr, 9462 n/a Py_ssize_t start, 9463 n/a Py_ssize_t end, 9464 n/a int direction) 9465 n/a { 9466 n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9467 n/a return -2; 9468 n/a 9469 n/a return any_find_slice(str, substr, start, end, direction); 9470 n/a } 9471 n/a 9472 n/a Py_ssize_t 9473 n/a PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9474 n/a Py_ssize_t start, Py_ssize_t end, 9475 n/a int direction) 9476 n/a { 9477 n/a int kind; 9478 n/a Py_ssize_t len, result; 9479 n/a if (PyUnicode_READY(str) == -1) 9480 n/a return -2; 9481 n/a len = PyUnicode_GET_LENGTH(str); 9482 n/a ADJUST_INDICES(start, end, len); 9483 n/a if (end - start < 1) 9484 n/a return -1; 9485 n/a kind = PyUnicode_KIND(str); 9486 n/a result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9487 n/a kind, end-start, ch, direction); 9488 n/a if (result == -1) 9489 n/a return -1; 9490 n/a else 9491 n/a return start + result; 9492 n/a } 9493 n/a 9494 n/a static int 9495 n/a tailmatch(PyObject *self, 9496 n/a PyObject *substring, 9497 n/a Py_ssize_t start, 9498 n/a Py_ssize_t end, 9499 n/a int direction) 9500 n/a { 9501 n/a int kind_self; 9502 n/a int kind_sub; 9503 n/a void *data_self; 9504 n/a void *data_sub; 9505 n/a Py_ssize_t offset; 9506 n/a Py_ssize_t i; 9507 n/a Py_ssize_t end_sub; 9508 n/a 9509 n/a if (PyUnicode_READY(self) == -1 || 9510 n/a PyUnicode_READY(substring) == -1) 9511 n/a return -1; 9512 n/a 9513 n/a ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9514 n/a end -= PyUnicode_GET_LENGTH(substring); 9515 n/a if (end < start) 9516 n/a return 0; 9517 n/a 9518 n/a if (PyUnicode_GET_LENGTH(substring) == 0) 9519 n/a return 1; 9520 n/a 9521 n/a kind_self = PyUnicode_KIND(self); 9522 n/a data_self = PyUnicode_DATA(self); 9523 n/a kind_sub = 
PyUnicode_KIND(substring); 9524 n/a data_sub = PyUnicode_DATA(substring); 9525 n/a end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9526 n/a 9527 n/a if (direction > 0) 9528 n/a offset = end; 9529 n/a else 9530 n/a offset = start; 9531 n/a 9532 n/a if (PyUnicode_READ(kind_self, data_self, offset) == 9533 n/a PyUnicode_READ(kind_sub, data_sub, 0) && 9534 n/a PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9535 n/a PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9536 n/a /* If both are of the same kind, memcmp is sufficient */ 9537 n/a if (kind_self == kind_sub) { 9538 n/a return ! memcmp((char *)data_self + 9539 n/a (offset * PyUnicode_KIND(substring)), 9540 n/a data_sub, 9541 n/a PyUnicode_GET_LENGTH(substring) * 9542 n/a PyUnicode_KIND(substring)); 9543 n/a } 9544 n/a /* otherwise we have to compare each character by first accessing it */ 9545 n/a else { 9546 n/a /* We do not need to compare 0 and len(substring)-1 because 9547 n/a the if statement above ensured already that they are equal 9548 n/a when we end up here. 
*/ 9549 n/a for (i = 1; i < end_sub; ++i) { 9550 n/a if (PyUnicode_READ(kind_self, data_self, offset + i) != 9551 n/a PyUnicode_READ(kind_sub, data_sub, i)) 9552 n/a return 0; 9553 n/a } 9554 n/a return 1; 9555 n/a } 9556 n/a } 9557 n/a 9558 n/a return 0; 9559 n/a } 9560 n/a 9561 n/a Py_ssize_t 9562 n/a PyUnicode_Tailmatch(PyObject *str, 9563 n/a PyObject *substr, 9564 n/a Py_ssize_t start, 9565 n/a Py_ssize_t end, 9566 n/a int direction) 9567 n/a { 9568 n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9569 n/a return -1; 9570 n/a 9571 n/a return tailmatch(str, substr, start, end, direction); 9572 n/a } 9573 n/a 9574 n/a /* Apply fixfct filter to the Unicode object self and return a 9575 n/a reference to the modified object */ 9576 n/a 9577 n/a static PyObject * 9578 n/a fixup(PyObject *self, 9579 n/a Py_UCS4 (*fixfct)(PyObject *s)) 9580 n/a { 9581 n/a PyObject *u; 9582 n/a Py_UCS4 maxchar_old, maxchar_new = 0; 9583 n/a PyObject *v; 9584 n/a 9585 n/a u = _PyUnicode_Copy(self); 9586 n/a if (u == NULL) 9587 n/a return NULL; 9588 n/a maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9589 n/a 9590 n/a /* fix functions return the new maximum character in a string, 9591 n/a if the kind of the resulting unicode object does not change, 9592 n/a everything is fine. Otherwise we need to change the string kind 9593 n/a and re-run the fix function. */ 9594 n/a maxchar_new = fixfct(u); 9595 n/a 9596 n/a if (maxchar_new == 0) { 9597 n/a /* no changes */; 9598 n/a if (PyUnicode_CheckExact(self)) { 9599 n/a Py_DECREF(u); 9600 n/a Py_INCREF(self); 9601 n/a return self; 9602 n/a } 9603 n/a else 9604 n/a return u; 9605 n/a } 9606 n/a 9607 n/a maxchar_new = align_maxchar(maxchar_new); 9608 n/a 9609 n/a if (maxchar_new == maxchar_old) 9610 n/a return u; 9611 n/a 9612 n/a /* In case the maximum character changed, we need to 9613 n/a convert the string to the new category. 
*/ 9614 n/a v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9615 n/a if (v == NULL) { 9616 n/a Py_DECREF(u); 9617 n/a return NULL; 9618 n/a } 9619 n/a if (maxchar_new > maxchar_old) { 9620 n/a /* If the maxchar increased so that the kind changed, not all 9621 n/a characters are representable anymore and we need to fix the 9622 n/a string again. This only happens in very few cases. */ 9623 n/a _PyUnicode_FastCopyCharacters(v, 0, 9624 n/a self, 0, PyUnicode_GET_LENGTH(self)); 9625 n/a maxchar_old = fixfct(v); 9626 n/a assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9627 n/a } 9628 n/a else { 9629 n/a _PyUnicode_FastCopyCharacters(v, 0, 9630 n/a u, 0, PyUnicode_GET_LENGTH(self)); 9631 n/a } 9632 n/a Py_DECREF(u); 9633 n/a assert(_PyUnicode_CheckConsistency(v, 1)); 9634 n/a return v; 9635 n/a } 9636 n/a 9637 n/a static PyObject * 9638 n/a ascii_upper_or_lower(PyObject *self, int lower) 9639 n/a { 9640 n/a Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9641 n/a char *resdata, *data = PyUnicode_DATA(self); 9642 n/a PyObject *res; 9643 n/a 9644 n/a res = PyUnicode_New(len, 127); 9645 n/a if (res == NULL) 9646 n/a return NULL; 9647 n/a resdata = PyUnicode_DATA(res); 9648 n/a if (lower) 9649 n/a _Py_bytes_lower(resdata, data, len); 9650 n/a else 9651 n/a _Py_bytes_upper(resdata, data, len); 9652 n/a return res; 9653 n/a } 9654 n/a 9655 n/a static Py_UCS4 9656 n/a handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9657 n/a { 9658 n/a Py_ssize_t j; 9659 n/a int final_sigma; 9660 n/a Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9661 n/a /* U+03A3 is in the Final_Sigma context when, it is found like this: 9662 n/a 9663 n/a \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9664 n/a 9665 n/a where ! is a negation and \p{xxx} is a character with property xxx. 
9666 n/a */ 9667 n/a for (j = i - 1; j >= 0; j--) { 9668 n/a c = PyUnicode_READ(kind, data, j); 9669 n/a if (!_PyUnicode_IsCaseIgnorable(c)) 9670 n/a break; 9671 n/a } 9672 n/a final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9673 n/a if (final_sigma) { 9674 n/a for (j = i + 1; j < length; j++) { 9675 n/a c = PyUnicode_READ(kind, data, j); 9676 n/a if (!_PyUnicode_IsCaseIgnorable(c)) 9677 n/a break; 9678 n/a } 9679 n/a final_sigma = j == length || !_PyUnicode_IsCased(c); 9680 n/a } 9681 n/a return (final_sigma) ? 0x3C2 : 0x3C3; 9682 n/a } 9683 n/a 9684 n/a static int 9685 n/a lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9686 n/a Py_UCS4 c, Py_UCS4 *mapped) 9687 n/a { 9688 n/a /* Obscure special case. */ 9689 n/a if (c == 0x3A3) { 9690 n/a mapped[0] = handle_capital_sigma(kind, data, length, i); 9691 n/a return 1; 9692 n/a } 9693 n/a return _PyUnicode_ToLowerFull(c, mapped); 9694 n/a } 9695 n/a 9696 n/a static Py_ssize_t 9697 n/a do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9698 n/a { 9699 n/a Py_ssize_t i, k = 0; 9700 n/a int n_res, j; 9701 n/a Py_UCS4 c, mapped[3]; 9702 n/a 9703 n/a c = PyUnicode_READ(kind, data, 0); 9704 n/a n_res = _PyUnicode_ToUpperFull(c, mapped); 9705 n/a for (j = 0; j < n_res; j++) { 9706 n/a *maxchar = Py_MAX(*maxchar, mapped[j]); 9707 n/a res[k++] = mapped[j]; 9708 n/a } 9709 n/a for (i = 1; i < length; i++) { 9710 n/a c = PyUnicode_READ(kind, data, i); 9711 n/a n_res = lower_ucs4(kind, data, length, i, c, mapped); 9712 n/a for (j = 0; j < n_res; j++) { 9713 n/a *maxchar = Py_MAX(*maxchar, mapped[j]); 9714 n/a res[k++] = mapped[j]; 9715 n/a } 9716 n/a } 9717 n/a return k; 9718 n/a } 9719 n/a 9720 n/a static Py_ssize_t 9721 n/a do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9722 n/a Py_ssize_t i, k = 0; 9723 n/a 9724 n/a for (i = 0; i < length; i++) { 9725 n/a Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9726 n/a int n_res, j; 9727 
n/a if (Py_UNICODE_ISUPPER(c)) { 9728 n/a n_res = lower_ucs4(kind, data, length, i, c, mapped); 9729 n/a } 9730 n/a else if (Py_UNICODE_ISLOWER(c)) { 9731 n/a n_res = _PyUnicode_ToUpperFull(c, mapped); 9732 n/a } 9733 n/a else { 9734 n/a n_res = 1; 9735 n/a mapped[0] = c; 9736 n/a } 9737 n/a for (j = 0; j < n_res; j++) { 9738 n/a *maxchar = Py_MAX(*maxchar, mapped[j]); 9739 n/a res[k++] = mapped[j]; 9740 n/a } 9741 n/a } 9742 n/a return k; 9743 n/a } 9744 n/a 9745 n/a static Py_ssize_t 9746 n/a do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9747 n/a Py_UCS4 *maxchar, int lower) 9748 n/a { 9749 n/a Py_ssize_t i, k = 0; 9750 n/a 9751 n/a for (i = 0; i < length; i++) { 9752 n/a Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9753 n/a int n_res, j; 9754 n/a if (lower) 9755 n/a n_res = lower_ucs4(kind, data, length, i, c, mapped); 9756 n/a else 9757 n/a n_res = _PyUnicode_ToUpperFull(c, mapped); 9758 n/a for (j = 0; j < n_res; j++) { 9759 n/a *maxchar = Py_MAX(*maxchar, mapped[j]); 9760 n/a res[k++] = mapped[j]; 9761 n/a } 9762 n/a } 9763 n/a return k; 9764 n/a } 9765 n/a 9766 n/a static Py_ssize_t 9767 n/a do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9768 n/a { 9769 n/a return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9770 n/a } 9771 n/a 9772 n/a static Py_ssize_t 9773 n/a do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9774 n/a { 9775 n/a return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9776 n/a } 9777 n/a 9778 n/a static Py_ssize_t 9779 n/a do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9780 n/a { 9781 n/a Py_ssize_t i, k = 0; 9782 n/a 9783 n/a for (i = 0; i < length; i++) { 9784 n/a Py_UCS4 c = PyUnicode_READ(kind, data, i); 9785 n/a Py_UCS4 mapped[3]; 9786 n/a int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9787 n/a for (j = 0; j < n_res; j++) { 9788 n/a *maxchar = Py_MAX(*maxchar, mapped[j]); 
9789 n/a res[k++] = mapped[j]; 9790 n/a } 9791 n/a } 9792 n/a return k; 9793 n/a } 9794 n/a 9795 n/a static Py_ssize_t 9796 n/a do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9797 n/a { 9798 n/a Py_ssize_t i, k = 0; 9799 n/a int previous_is_cased; 9800 n/a 9801 n/a previous_is_cased = 0; 9802 n/a for (i = 0; i < length; i++) { 9803 n/a const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9804 n/a Py_UCS4 mapped[3]; 9805 n/a int n_res, j; 9806 n/a 9807 n/a if (previous_is_cased) 9808 n/a n_res = lower_ucs4(kind, data, length, i, c, mapped); 9809 n/a else 9810 n/a n_res = _PyUnicode_ToTitleFull(c, mapped); 9811 n/a 9812 n/a for (j = 0; j < n_res; j++) { 9813 n/a *maxchar = Py_MAX(*maxchar, mapped[j]); 9814 n/a res[k++] = mapped[j]; 9815 n/a } 9816 n/a 9817 n/a previous_is_cased = _PyUnicode_IsCased(c); 9818 n/a } 9819 n/a return k; 9820 n/a } 9821 n/a 9822 n/a static PyObject * 9823 n/a case_operation(PyObject *self, 9824 n/a Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9825 n/a { 9826 n/a PyObject *res = NULL; 9827 n/a Py_ssize_t length, newlength = 0; 9828 n/a int kind, outkind; 9829 n/a void *data, *outdata; 9830 n/a Py_UCS4 maxchar = 0, *tmp, *tmpend; 9831 n/a 9832 n/a assert(PyUnicode_IS_READY(self)); 9833 n/a 9834 n/a kind = PyUnicode_KIND(self); 9835 n/a data = PyUnicode_DATA(self); 9836 n/a length = PyUnicode_GET_LENGTH(self); 9837 n/a if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9838 n/a PyErr_SetString(PyExc_OverflowError, "string is too long"); 9839 n/a return NULL; 9840 n/a } 9841 n/a tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9842 n/a if (tmp == NULL) 9843 n/a return PyErr_NoMemory(); 9844 n/a newlength = perform(kind, data, length, tmp, &maxchar); 9845 n/a res = PyUnicode_New(newlength, maxchar); 9846 n/a if (res == NULL) 9847 n/a goto leave; 9848 n/a tmpend = tmp + newlength; 9849 n/a outdata = PyUnicode_DATA(res); 9850 n/a outkind = PyUnicode_KIND(res); 9851 n/a switch 
(outkind) { 9852 n/a case PyUnicode_1BYTE_KIND: 9853 n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9854 n/a break; 9855 n/a case PyUnicode_2BYTE_KIND: 9856 n/a _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9857 n/a break; 9858 n/a case PyUnicode_4BYTE_KIND: 9859 n/a memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9860 n/a break; 9861 n/a default: 9862 n/a assert(0); 9863 n/a break; 9864 n/a } 9865 n/a leave: 9866 n/a PyMem_FREE(tmp); 9867 n/a return res; 9868 n/a } 9869 n/a 9870 n/a PyObject * 9871 n/a PyUnicode_Join(PyObject *separator, PyObject *seq) 9872 n/a { 9873 n/a PyObject *res; 9874 n/a PyObject *fseq; 9875 n/a Py_ssize_t seqlen; 9876 n/a PyObject **items; 9877 n/a 9878 n/a fseq = PySequence_Fast(seq, "can only join an iterable"); 9879 n/a if (fseq == NULL) { 9880 n/a return NULL; 9881 n/a } 9882 n/a 9883 n/a /* NOTE: the following code can't call back into Python code, 9884 n/a * so we are sure that fseq won't be mutated. 9885 n/a */ 9886 n/a 9887 n/a items = PySequence_Fast_ITEMS(fseq); 9888 n/a seqlen = PySequence_Fast_GET_SIZE(fseq); 9889 n/a res = _PyUnicode_JoinArray(separator, items, seqlen); 9890 n/a Py_DECREF(fseq); 9891 n/a return res; 9892 n/a } 9893 n/a 9894 n/a PyObject * 9895 n/a _PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen) 9896 n/a { 9897 n/a PyObject *res = NULL; /* the result */ 9898 n/a PyObject *sep = NULL; 9899 n/a Py_ssize_t seplen; 9900 n/a PyObject *item; 9901 n/a Py_ssize_t sz, i, res_offset; 9902 n/a Py_UCS4 maxchar; 9903 n/a Py_UCS4 item_maxchar; 9904 n/a int use_memcpy; 9905 n/a unsigned char *res_data = NULL, *sep_data = NULL; 9906 n/a PyObject *last_obj; 9907 n/a unsigned int kind = 0; 9908 n/a 9909 n/a /* If empty sequence, return u"". */ 9910 n/a if (seqlen == 0) { 9911 n/a _Py_RETURN_UNICODE_EMPTY(); 9912 n/a } 9913 n/a 9914 n/a /* If singleton sequence with an exact Unicode, return that. 
*/ 9915 n/a last_obj = NULL; 9916 n/a if (seqlen == 1) { 9917 n/a if (PyUnicode_CheckExact(items[0])) { 9918 n/a res = items[0]; 9919 n/a Py_INCREF(res); 9920 n/a return res; 9921 n/a } 9922 n/a seplen = 0; 9923 n/a maxchar = 0; 9924 n/a } 9925 n/a else { 9926 n/a /* Set up sep and seplen */ 9927 n/a if (separator == NULL) { 9928 n/a /* fall back to a blank space separator */ 9929 n/a sep = PyUnicode_FromOrdinal(' '); 9930 n/a if (!sep) 9931 n/a goto onError; 9932 n/a seplen = 1; 9933 n/a maxchar = 32; 9934 n/a } 9935 n/a else { 9936 n/a if (!PyUnicode_Check(separator)) { 9937 n/a PyErr_Format(PyExc_TypeError, 9938 n/a "separator: expected str instance," 9939 n/a " %.80s found", 9940 n/a Py_TYPE(separator)->tp_name); 9941 n/a goto onError; 9942 n/a } 9943 n/a if (PyUnicode_READY(separator)) 9944 n/a goto onError; 9945 n/a sep = separator; 9946 n/a seplen = PyUnicode_GET_LENGTH(separator); 9947 n/a maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9948 n/a /* inc refcount to keep this code path symmetric with the 9949 n/a above case of a blank separator */ 9950 n/a Py_INCREF(sep); 9951 n/a } 9952 n/a last_obj = sep; 9953 n/a } 9954 n/a 9955 n/a /* There are at least two things to join, or else we have a subclass 9956 n/a * of str in the sequence. 9957 n/a * Do a pre-pass to figure out the total amount of space we'll 9958 n/a * need (sz), and see whether all argument are strings. 
9959 n/a */ 9960 n/a sz = 0; 9961 n/a #ifdef Py_DEBUG 9962 n/a use_memcpy = 0; 9963 n/a #else 9964 n/a use_memcpy = 1; 9965 n/a #endif 9966 n/a for (i = 0; i < seqlen; i++) { 9967 n/a size_t add_sz; 9968 n/a item = items[i]; 9969 n/a if (!PyUnicode_Check(item)) { 9970 n/a PyErr_Format(PyExc_TypeError, 9971 n/a "sequence item %zd: expected str instance," 9972 n/a " %.80s found", 9973 n/a i, Py_TYPE(item)->tp_name); 9974 n/a goto onError; 9975 n/a } 9976 n/a if (PyUnicode_READY(item) == -1) 9977 n/a goto onError; 9978 n/a add_sz = PyUnicode_GET_LENGTH(item); 9979 n/a item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9980 n/a maxchar = Py_MAX(maxchar, item_maxchar); 9981 n/a if (i != 0) { 9982 n/a add_sz += seplen; 9983 n/a } 9984 n/a if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) { 9985 n/a PyErr_SetString(PyExc_OverflowError, 9986 n/a "join() result is too long for a Python string"); 9987 n/a goto onError; 9988 n/a } 9989 n/a sz += add_sz; 9990 n/a if (use_memcpy && last_obj != NULL) { 9991 n/a if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9992 n/a use_memcpy = 0; 9993 n/a } 9994 n/a last_obj = item; 9995 n/a } 9996 n/a 9997 n/a res = PyUnicode_New(sz, maxchar); 9998 n/a if (res == NULL) 9999 n/a goto onError; 10000 n/a 10001 n/a /* Catenate everything. */ 10002 n/a #ifdef Py_DEBUG 10003 n/a use_memcpy = 0; 10004 n/a #else 10005 n/a if (use_memcpy) { 10006 n/a res_data = PyUnicode_1BYTE_DATA(res); 10007 n/a kind = PyUnicode_KIND(res); 10008 n/a if (seplen != 0) 10009 n/a sep_data = PyUnicode_1BYTE_DATA(sep); 10010 n/a } 10011 n/a #endif 10012 n/a if (use_memcpy) { 10013 n/a for (i = 0; i < seqlen; ++i) { 10014 n/a Py_ssize_t itemlen; 10015 n/a item = items[i]; 10016 n/a 10017 n/a /* Copy item, and maybe the separator. 
*/ 10018 n/a if (i && seplen != 0) { 10019 n/a memcpy(res_data, 10020 n/a sep_data, 10021 n/a kind * seplen); 10022 n/a res_data += kind * seplen; 10023 n/a } 10024 n/a 10025 n/a itemlen = PyUnicode_GET_LENGTH(item); 10026 n/a if (itemlen != 0) { 10027 n/a memcpy(res_data, 10028 n/a PyUnicode_DATA(item), 10029 n/a kind * itemlen); 10030 n/a res_data += kind * itemlen; 10031 n/a } 10032 n/a } 10033 n/a assert(res_data == PyUnicode_1BYTE_DATA(res) 10034 n/a + kind * PyUnicode_GET_LENGTH(res)); 10035 n/a } 10036 n/a else { 10037 n/a for (i = 0, res_offset = 0; i < seqlen; ++i) { 10038 n/a Py_ssize_t itemlen; 10039 n/a item = items[i]; 10040 n/a 10041 n/a /* Copy item, and maybe the separator. */ 10042 n/a if (i && seplen != 0) { 10043 n/a _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 10044 n/a res_offset += seplen; 10045 n/a } 10046 n/a 10047 n/a itemlen = PyUnicode_GET_LENGTH(item); 10048 n/a if (itemlen != 0) { 10049 n/a _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 10050 n/a res_offset += itemlen; 10051 n/a } 10052 n/a } 10053 n/a assert(res_offset == PyUnicode_GET_LENGTH(res)); 10054 n/a } 10055 n/a 10056 n/a Py_XDECREF(sep); 10057 n/a assert(_PyUnicode_CheckConsistency(res, 1)); 10058 n/a return res; 10059 n/a 10060 n/a onError: 10061 n/a Py_XDECREF(sep); 10062 n/a Py_XDECREF(res); 10063 n/a return NULL; 10064 n/a } 10065 n/a 10066 n/a #define FILL(kind, data, value, start, length) \ 10067 n/a do { \ 10068 n/a Py_ssize_t i_ = 0; \ 10069 n/a assert(kind != PyUnicode_WCHAR_KIND); \ 10070 n/a switch ((kind)) { \ 10071 n/a case PyUnicode_1BYTE_KIND: { \ 10072 n/a unsigned char * to_ = (unsigned char *)((data)) + (start); \ 10073 n/a memset(to_, (unsigned char)value, (length)); \ 10074 n/a break; \ 10075 n/a } \ 10076 n/a case PyUnicode_2BYTE_KIND: { \ 10077 n/a Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 10078 n/a for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 10079 n/a break; \ 10080 n/a } \ 10081 n/a case 
PyUnicode_4BYTE_KIND: { \ 10082 n/a Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 10083 n/a for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 10084 n/a break; \ 10085 n/a } \ 10086 n/a default: assert(0); \ 10087 n/a } \ 10088 n/a } while (0) 10089 n/a 10090 n/a void 10091 n/a _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10092 n/a Py_UCS4 fill_char) 10093 n/a { 10094 n/a const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 10095 n/a const void *data = PyUnicode_DATA(unicode); 10096 n/a assert(PyUnicode_IS_READY(unicode)); 10097 n/a assert(unicode_modifiable(unicode)); 10098 n/a assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 10099 n/a assert(start >= 0); 10100 n/a assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 10101 n/a FILL(kind, data, fill_char, start, length); 10102 n/a } 10103 n/a 10104 n/a Py_ssize_t 10105 n/a PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10106 n/a Py_UCS4 fill_char) 10107 n/a { 10108 n/a Py_ssize_t maxlen; 10109 n/a 10110 n/a if (!PyUnicode_Check(unicode)) { 10111 n/a PyErr_BadInternalCall(); 10112 n/a return -1; 10113 n/a } 10114 n/a if (PyUnicode_READY(unicode) == -1) 10115 n/a return -1; 10116 n/a if (unicode_check_modifiable(unicode)) 10117 n/a return -1; 10118 n/a 10119 n/a if (start < 0) { 10120 n/a PyErr_SetString(PyExc_IndexError, "string index out of range"); 10121 n/a return -1; 10122 n/a } 10123 n/a if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 10124 n/a PyErr_SetString(PyExc_ValueError, 10125 n/a "fill character is bigger than " 10126 n/a "the string maximum character"); 10127 n/a return -1; 10128 n/a } 10129 n/a 10130 n/a maxlen = PyUnicode_GET_LENGTH(unicode) - start; 10131 n/a length = Py_MIN(maxlen, length); 10132 n/a if (length <= 0) 10133 n/a return 0; 10134 n/a 10135 n/a _PyUnicode_FastFill(unicode, start, length, fill_char); 10136 n/a return length; 10137 n/a } 10138 n/a 10139 n/a static PyObject * 10140 n/a pad(PyObject *self, 
10141 n/a Py_ssize_t left, 10142 n/a Py_ssize_t right, 10143 n/a Py_UCS4 fill) 10144 n/a { 10145 n/a PyObject *u; 10146 n/a Py_UCS4 maxchar; 10147 n/a int kind; 10148 n/a void *data; 10149 n/a 10150 n/a if (left < 0) 10151 n/a left = 0; 10152 n/a if (right < 0) 10153 n/a right = 0; 10154 n/a 10155 n/a if (left == 0 && right == 0) 10156 n/a return unicode_result_unchanged(self); 10157 n/a 10158 n/a if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 10159 n/a right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 10160 n/a PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 10161 n/a return NULL; 10162 n/a } 10163 n/a maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10164 n/a maxchar = Py_MAX(maxchar, fill); 10165 n/a u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 10166 n/a if (!u) 10167 n/a return NULL; 10168 n/a 10169 n/a kind = PyUnicode_KIND(u); 10170 n/a data = PyUnicode_DATA(u); 10171 n/a if (left) 10172 n/a FILL(kind, data, fill, 0, left); 10173 n/a if (right) 10174 n/a FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10175 n/a _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 10176 n/a assert(_PyUnicode_CheckConsistency(u, 1)); 10177 n/a return u; 10178 n/a } 10179 n/a 10180 n/a PyObject * 10181 n/a PyUnicode_Splitlines(PyObject *string, int keepends) 10182 n/a { 10183 n/a PyObject *list; 10184 n/a 10185 n/a if (ensure_unicode(string) < 0) 10186 n/a return NULL; 10187 n/a 10188 n/a switch (PyUnicode_KIND(string)) { 10189 n/a case PyUnicode_1BYTE_KIND: 10190 n/a if (PyUnicode_IS_ASCII(string)) 10191 n/a list = asciilib_splitlines( 10192 n/a string, PyUnicode_1BYTE_DATA(string), 10193 n/a PyUnicode_GET_LENGTH(string), keepends); 10194 n/a else 10195 n/a list = ucs1lib_splitlines( 10196 n/a string, PyUnicode_1BYTE_DATA(string), 10197 n/a PyUnicode_GET_LENGTH(string), keepends); 10198 n/a break; 10199 n/a case PyUnicode_2BYTE_KIND: 10200 n/a list = ucs2lib_splitlines( 10201 n/a string, 
PyUnicode_2BYTE_DATA(string), 10202 n/a PyUnicode_GET_LENGTH(string), keepends); 10203 n/a break; 10204 n/a case PyUnicode_4BYTE_KIND: 10205 n/a list = ucs4lib_splitlines( 10206 n/a string, PyUnicode_4BYTE_DATA(string), 10207 n/a PyUnicode_GET_LENGTH(string), keepends); 10208 n/a break; 10209 n/a default: 10210 n/a assert(0); 10211 n/a list = 0; 10212 n/a } 10213 n/a return list; 10214 n/a } 10215 n/a 10216 n/a static PyObject * 10217 n/a split(PyObject *self, 10218 n/a PyObject *substring, 10219 n/a Py_ssize_t maxcount) 10220 n/a { 10221 n/a int kind1, kind2; 10222 n/a void *buf1, *buf2; 10223 n/a Py_ssize_t len1, len2; 10224 n/a PyObject* out; 10225 n/a 10226 n/a if (maxcount < 0) 10227 n/a maxcount = PY_SSIZE_T_MAX; 10228 n/a 10229 n/a if (PyUnicode_READY(self) == -1) 10230 n/a return NULL; 10231 n/a 10232 n/a if (substring == NULL) 10233 n/a switch (PyUnicode_KIND(self)) { 10234 n/a case PyUnicode_1BYTE_KIND: 10235 n/a if (PyUnicode_IS_ASCII(self)) 10236 n/a return asciilib_split_whitespace( 10237 n/a self, PyUnicode_1BYTE_DATA(self), 10238 n/a PyUnicode_GET_LENGTH(self), maxcount 10239 n/a ); 10240 n/a else 10241 n/a return ucs1lib_split_whitespace( 10242 n/a self, PyUnicode_1BYTE_DATA(self), 10243 n/a PyUnicode_GET_LENGTH(self), maxcount 10244 n/a ); 10245 n/a case PyUnicode_2BYTE_KIND: 10246 n/a return ucs2lib_split_whitespace( 10247 n/a self, PyUnicode_2BYTE_DATA(self), 10248 n/a PyUnicode_GET_LENGTH(self), maxcount 10249 n/a ); 10250 n/a case PyUnicode_4BYTE_KIND: 10251 n/a return ucs4lib_split_whitespace( 10252 n/a self, PyUnicode_4BYTE_DATA(self), 10253 n/a PyUnicode_GET_LENGTH(self), maxcount 10254 n/a ); 10255 n/a default: 10256 n/a assert(0); 10257 n/a return NULL; 10258 n/a } 10259 n/a 10260 n/a if (PyUnicode_READY(substring) == -1) 10261 n/a return NULL; 10262 n/a 10263 n/a kind1 = PyUnicode_KIND(self); 10264 n/a kind2 = PyUnicode_KIND(substring); 10265 n/a len1 = PyUnicode_GET_LENGTH(self); 10266 n/a len2 = PyUnicode_GET_LENGTH(substring); 10267 
n/a if (kind1 < kind2 || len1 < len2) { 10268 n/a out = PyList_New(1); 10269 n/a if (out == NULL) 10270 n/a return NULL; 10271 n/a Py_INCREF(self); 10272 n/a PyList_SET_ITEM(out, 0, self); 10273 n/a return out; 10274 n/a } 10275 n/a buf1 = PyUnicode_DATA(self); 10276 n/a buf2 = PyUnicode_DATA(substring); 10277 n/a if (kind2 != kind1) { 10278 n/a buf2 = _PyUnicode_AsKind(substring, kind1); 10279 n/a if (!buf2) 10280 n/a return NULL; 10281 n/a } 10282 n/a 10283 n/a switch (kind1) { 10284 n/a case PyUnicode_1BYTE_KIND: 10285 n/a if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10286 n/a out = asciilib_split( 10287 n/a self, buf1, len1, buf2, len2, maxcount); 10288 n/a else 10289 n/a out = ucs1lib_split( 10290 n/a self, buf1, len1, buf2, len2, maxcount); 10291 n/a break; 10292 n/a case PyUnicode_2BYTE_KIND: 10293 n/a out = ucs2lib_split( 10294 n/a self, buf1, len1, buf2, len2, maxcount); 10295 n/a break; 10296 n/a case PyUnicode_4BYTE_KIND: 10297 n/a out = ucs4lib_split( 10298 n/a self, buf1, len1, buf2, len2, maxcount); 10299 n/a break; 10300 n/a default: 10301 n/a out = NULL; 10302 n/a } 10303 n/a if (kind2 != kind1) 10304 n/a PyMem_Free(buf2); 10305 n/a return out; 10306 n/a } 10307 n/a 10308 n/a static PyObject * 10309 n/a rsplit(PyObject *self, 10310 n/a PyObject *substring, 10311 n/a Py_ssize_t maxcount) 10312 n/a { 10313 n/a int kind1, kind2; 10314 n/a void *buf1, *buf2; 10315 n/a Py_ssize_t len1, len2; 10316 n/a PyObject* out; 10317 n/a 10318 n/a if (maxcount < 0) 10319 n/a maxcount = PY_SSIZE_T_MAX; 10320 n/a 10321 n/a if (PyUnicode_READY(self) == -1) 10322 n/a return NULL; 10323 n/a 10324 n/a if (substring == NULL) 10325 n/a switch (PyUnicode_KIND(self)) { 10326 n/a case PyUnicode_1BYTE_KIND: 10327 n/a if (PyUnicode_IS_ASCII(self)) 10328 n/a return asciilib_rsplit_whitespace( 10329 n/a self, PyUnicode_1BYTE_DATA(self), 10330 n/a PyUnicode_GET_LENGTH(self), maxcount 10331 n/a ); 10332 n/a else 10333 n/a return ucs1lib_rsplit_whitespace( 10334 n/a 
self, PyUnicode_1BYTE_DATA(self), 10335 n/a PyUnicode_GET_LENGTH(self), maxcount 10336 n/a ); 10337 n/a case PyUnicode_2BYTE_KIND: 10338 n/a return ucs2lib_rsplit_whitespace( 10339 n/a self, PyUnicode_2BYTE_DATA(self), 10340 n/a PyUnicode_GET_LENGTH(self), maxcount 10341 n/a ); 10342 n/a case PyUnicode_4BYTE_KIND: 10343 n/a return ucs4lib_rsplit_whitespace( 10344 n/a self, PyUnicode_4BYTE_DATA(self), 10345 n/a PyUnicode_GET_LENGTH(self), maxcount 10346 n/a ); 10347 n/a default: 10348 n/a assert(0); 10349 n/a return NULL; 10350 n/a } 10351 n/a 10352 n/a if (PyUnicode_READY(substring) == -1) 10353 n/a return NULL; 10354 n/a 10355 n/a kind1 = PyUnicode_KIND(self); 10356 n/a kind2 = PyUnicode_KIND(substring); 10357 n/a len1 = PyUnicode_GET_LENGTH(self); 10358 n/a len2 = PyUnicode_GET_LENGTH(substring); 10359 n/a if (kind1 < kind2 || len1 < len2) { 10360 n/a out = PyList_New(1); 10361 n/a if (out == NULL) 10362 n/a return NULL; 10363 n/a Py_INCREF(self); 10364 n/a PyList_SET_ITEM(out, 0, self); 10365 n/a return out; 10366 n/a } 10367 n/a buf1 = PyUnicode_DATA(self); 10368 n/a buf2 = PyUnicode_DATA(substring); 10369 n/a if (kind2 != kind1) { 10370 n/a buf2 = _PyUnicode_AsKind(substring, kind1); 10371 n/a if (!buf2) 10372 n/a return NULL; 10373 n/a } 10374 n/a 10375 n/a switch (kind1) { 10376 n/a case PyUnicode_1BYTE_KIND: 10377 n/a if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10378 n/a out = asciilib_rsplit( 10379 n/a self, buf1, len1, buf2, len2, maxcount); 10380 n/a else 10381 n/a out = ucs1lib_rsplit( 10382 n/a self, buf1, len1, buf2, len2, maxcount); 10383 n/a break; 10384 n/a case PyUnicode_2BYTE_KIND: 10385 n/a out = ucs2lib_rsplit( 10386 n/a self, buf1, len1, buf2, len2, maxcount); 10387 n/a break; 10388 n/a case PyUnicode_4BYTE_KIND: 10389 n/a out = ucs4lib_rsplit( 10390 n/a self, buf1, len1, buf2, len2, maxcount); 10391 n/a break; 10392 n/a default: 10393 n/a out = NULL; 10394 n/a } 10395 n/a if (kind2 != kind1) 10396 n/a PyMem_Free(buf2); 
10397 n/a return out; 10398 n/a } 10399 n/a 10400 n/a static Py_ssize_t 10401 n/a anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10402 n/a PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10403 n/a { 10404 n/a switch (kind) { 10405 n/a case PyUnicode_1BYTE_KIND: 10406 n/a if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10407 n/a return asciilib_find(buf1, len1, buf2, len2, offset); 10408 n/a else 10409 n/a return ucs1lib_find(buf1, len1, buf2, len2, offset); 10410 n/a case PyUnicode_2BYTE_KIND: 10411 n/a return ucs2lib_find(buf1, len1, buf2, len2, offset); 10412 n/a case PyUnicode_4BYTE_KIND: 10413 n/a return ucs4lib_find(buf1, len1, buf2, len2, offset); 10414 n/a } 10415 n/a assert(0); 10416 n/a return -1; 10417 n/a } 10418 n/a 10419 n/a static Py_ssize_t 10420 n/a anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10421 n/a PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10422 n/a { 10423 n/a switch (kind) { 10424 n/a case PyUnicode_1BYTE_KIND: 10425 n/a if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10426 n/a return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10427 n/a else 10428 n/a return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10429 n/a case PyUnicode_2BYTE_KIND: 10430 n/a return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10431 n/a case PyUnicode_4BYTE_KIND: 10432 n/a return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10433 n/a } 10434 n/a assert(0); 10435 n/a return 0; 10436 n/a } 10437 n/a 10438 n/a static void 10439 n/a replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10440 n/a Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10441 n/a { 10442 n/a int kind = PyUnicode_KIND(u); 10443 n/a void *data = PyUnicode_DATA(u); 10444 n/a Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10445 n/a if (kind == PyUnicode_1BYTE_KIND) { 10446 n/a ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10447 n/a (Py_UCS1 *)data + len, 10448 n/a u1, u2, maxcount); 10449 
}
    else if (kind == PyUnicode_2BYTE_KIND) {
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
                                      (Py_UCS2 *)data + len,
                                      u1, u2, maxcount);
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
                                      (Py_UCS4 *)data + len,
                                      u1, u2, maxcount);
    }
}

/* Build the string obtained from self by replacing occurrences of str1
   with str2.

   maxcount limits the number of replacements; a negative value is turned
   into PY_SSIZE_T_MAX (no practical limit).

   Returns:
     - unicode_result_unchanged(self) when there is nothing to replace
       (maxcount == 0, empty self, str1 is str2, str1 wider than self,
       str1 and str2 both empty, or no match found);
     - otherwise a string created here with PyUnicode_New() (or the shared
       empty string when the result has length 0);
     - NULL on failure (allocation failure in a helper, or OverflowError
       when the result length would not fit).

   The function reads kind/data/length of all three arguments without
   calling PyUnicode_READY() -- presumably the callers ready them first;
   NOTE(review): confirm against the call sites.

   Three strategies are used:
     1. len1 == len2 == 1: copy self and patch single characters in place;
     2. len1 == len2 > 1:  copy self and overwrite each match in place;
     3. len1 != len2:      count the matches, size the result, then copy
                           the unchanged runs and the replacement piecewise.
*/
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    /* A release flag is set once the matching buffer pointer has been
       replaced by a widened copy obtained from _PyUnicode_AsKind(); the
       exit paths below free only the flagged buffers. */
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* Replacing a string with the very same object cannot change self. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* The result must be able to hold both the untouched part of self and
       the characters of str2. */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            /* Look for a first occurrence before allocating anything. */
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* Copy all of self, then patch u1 -> u2 starting at pos. */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the character width (in bytes) used for every
               buffer handed to anylib_find() and memcpy() below. */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* Find the first match before allocating the result. */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* Drop the copy of str1 made for the old rkind (if any)
                   before converting again to the new, wider kind. */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* Start from a full copy of self (in the result's kind). */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* One replacement has already been made above, hence the
               pre-decrement of maxcount. */
            while ( --maxcount > 0) {
                /* The search starts at character i of sbuf; i is also
                   passed as the last argument -- presumably the offset
                   added to the returned index; TODO confirm against
                   anylib_find(). */
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* Different lengths: the result size depends on the match count.
           n     - number of replacements to perform
           i     - read position in sbuf (in characters)
           j     - position of the current match in sbuf
           ires  - write position in res (in characters) */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* Drop the copy of str1 made for the old rkind (if any)
               before converting again to the new, wider kind. */
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        /* Only growth can overflow; n is non-zero here, so the division
           is safe. */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* Everything was replaced by nothing: hand out the shared
               empty string and skip the maxchar adjustment below. */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* Guard the rkind * count byte-size computations used below. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* Empty pattern: emit str2 n times, copying one character of
               self between consecutive insertions, then the rest of self. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* The widest characters may all have been replaced; let the helper
           update u (it may set u to NULL on failure). */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* Success: free the temporary widened buffers and return the result. */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* Failure: free whatever temporary buffers are currently owned. */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}

/* --- Unicode Object Methods --------------------------------------------- */

/*[clinic input]
str.title as unicode_title

Return a version of the string where each word is titlecased.

More specifically, words start with uppercased characters and all remaining
cased characters have lower case.
[clinic start generated code]*/

/* Implementation of str.title(): readies self, then delegates to
   case_operation() with the do_title callback. */
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{
    if (PyUnicode_READY(self) == -1)
        return NULL;
    return case_operation(self, do_title);
}

/*[clinic input]
str.capitalize as unicode_capitalize

Return a capitalized version of the string.

More specifically, make the first character have upper case and the rest lower
case.
10752 n/a [clinic start generated code]*/ 10753 n/a 10754 n/a static PyObject * 10755 n/a unicode_capitalize_impl(PyObject *self) 10756 n/a /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/ 10757 n/a { 10758 n/a if (PyUnicode_READY(self) == -1) 10759 n/a return NULL; 10760 n/a if (PyUnicode_GET_LENGTH(self) == 0) 10761 n/a return unicode_result_unchanged(self); 10762 n/a return case_operation(self, do_capitalize); 10763 n/a } 10764 n/a 10765 n/a /*[clinic input] 10766 n/a str.casefold as unicode_casefold 10767 n/a 10768 n/a Return a version of the string suitable for caseless comparisons. 10769 n/a [clinic start generated code]*/ 10770 n/a 10771 n/a static PyObject * 10772 n/a unicode_casefold_impl(PyObject *self) 10773 n/a /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/ 10774 n/a { 10775 n/a if (PyUnicode_READY(self) == -1) 10776 n/a return NULL; 10777 n/a if (PyUnicode_IS_ASCII(self)) 10778 n/a return ascii_upper_or_lower(self, 1); 10779 n/a return case_operation(self, do_casefold); 10780 n/a } 10781 n/a 10782 n/a 10783 n/a /* Argument converter. Accepts a single Unicode character. 
*/ 10784 n/a 10785 n/a static int 10786 n/a convert_uc(PyObject *obj, void *addr) 10787 n/a { 10788 n/a Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10789 n/a 10790 n/a if (!PyUnicode_Check(obj)) { 10791 n/a PyErr_Format(PyExc_TypeError, 10792 n/a "The fill character must be a unicode character, " 10793 n/a "not %.100s", Py_TYPE(obj)->tp_name); 10794 n/a return 0; 10795 n/a } 10796 n/a if (PyUnicode_READY(obj) < 0) 10797 n/a return 0; 10798 n/a if (PyUnicode_GET_LENGTH(obj) != 1) { 10799 n/a PyErr_SetString(PyExc_TypeError, 10800 n/a "The fill character must be exactly one character long"); 10801 n/a return 0; 10802 n/a } 10803 n/a *fillcharloc = PyUnicode_READ_CHAR(obj, 0); 10804 n/a return 1; 10805 n/a } 10806 n/a 10807 n/a /*[clinic input] 10808 n/a str.center as unicode_center 10809 n/a 10810 n/a width: Py_ssize_t 10811 n/a fillchar: Py_UCS4 = ' ' 10812 n/a / 10813 n/a 10814 n/a Return a centered string of length width. 10815 n/a 10816 n/a Padding is done using the specified fill character (default is a space). 10817 n/a [clinic start generated code]*/ 10818 n/a 10819 n/a static PyObject * 10820 n/a unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 10821 n/a /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/ 10822 n/a { 10823 n/a Py_ssize_t marg, left; 10824 n/a 10825 n/a if (PyUnicode_READY(self) == -1) 10826 n/a return NULL; 10827 n/a 10828 n/a if (PyUnicode_GET_LENGTH(self) >= width) 10829 n/a return unicode_result_unchanged(self); 10830 n/a 10831 n/a marg = width - PyUnicode_GET_LENGTH(self); 10832 n/a left = marg / 2 + (marg & width & 1); 10833 n/a 10834 n/a return pad(self, left, marg - left, fillchar); 10835 n/a } 10836 n/a 10837 n/a /* This function assumes that str1 and str2 are readied by the caller. 
*/ 10838 n/a 10839 n/a static int 10840 n/a unicode_compare(PyObject *str1, PyObject *str2) 10841 n/a { 10842 n/a #define COMPARE(TYPE1, TYPE2) \ 10843 n/a do { \ 10844 n/a TYPE1* p1 = (TYPE1 *)data1; \ 10845 n/a TYPE2* p2 = (TYPE2 *)data2; \ 10846 n/a TYPE1* end = p1 + len; \ 10847 n/a Py_UCS4 c1, c2; \ 10848 n/a for (; p1 != end; p1++, p2++) { \ 10849 n/a c1 = *p1; \ 10850 n/a c2 = *p2; \ 10851 n/a if (c1 != c2) \ 10852 n/a return (c1 < c2) ? -1 : 1; \ 10853 n/a } \ 10854 n/a } \ 10855 n/a while (0) 10856 n/a 10857 n/a int kind1, kind2; 10858 n/a void *data1, *data2; 10859 n/a Py_ssize_t len1, len2, len; 10860 n/a 10861 n/a kind1 = PyUnicode_KIND(str1); 10862 n/a kind2 = PyUnicode_KIND(str2); 10863 n/a data1 = PyUnicode_DATA(str1); 10864 n/a data2 = PyUnicode_DATA(str2); 10865 n/a len1 = PyUnicode_GET_LENGTH(str1); 10866 n/a len2 = PyUnicode_GET_LENGTH(str2); 10867 n/a len = Py_MIN(len1, len2); 10868 n/a 10869 n/a switch(kind1) { 10870 n/a case PyUnicode_1BYTE_KIND: 10871 n/a { 10872 n/a switch(kind2) { 10873 n/a case PyUnicode_1BYTE_KIND: 10874 n/a { 10875 n/a int cmp = memcmp(data1, data2, len); 10876 n/a /* normalize result of memcmp() into the range [-1; 1] */ 10877 n/a if (cmp < 0) 10878 n/a return -1; 10879 n/a if (cmp > 0) 10880 n/a return 1; 10881 n/a break; 10882 n/a } 10883 n/a case PyUnicode_2BYTE_KIND: 10884 n/a COMPARE(Py_UCS1, Py_UCS2); 10885 n/a break; 10886 n/a case PyUnicode_4BYTE_KIND: 10887 n/a COMPARE(Py_UCS1, Py_UCS4); 10888 n/a break; 10889 n/a default: 10890 n/a assert(0); 10891 n/a } 10892 n/a break; 10893 n/a } 10894 n/a case PyUnicode_2BYTE_KIND: 10895 n/a { 10896 n/a switch(kind2) { 10897 n/a case PyUnicode_1BYTE_KIND: 10898 n/a COMPARE(Py_UCS2, Py_UCS1); 10899 n/a break; 10900 n/a case PyUnicode_2BYTE_KIND: 10901 n/a { 10902 n/a COMPARE(Py_UCS2, Py_UCS2); 10903 n/a break; 10904 n/a } 10905 n/a case PyUnicode_4BYTE_KIND: 10906 n/a COMPARE(Py_UCS2, Py_UCS4); 10907 n/a break; 10908 n/a default: 10909 n/a assert(0); 10910 n/a } 10911 n/a 
break; 10912 n/a } 10913 n/a case PyUnicode_4BYTE_KIND: 10914 n/a { 10915 n/a switch(kind2) { 10916 n/a case PyUnicode_1BYTE_KIND: 10917 n/a COMPARE(Py_UCS4, Py_UCS1); 10918 n/a break; 10919 n/a case PyUnicode_2BYTE_KIND: 10920 n/a COMPARE(Py_UCS4, Py_UCS2); 10921 n/a break; 10922 n/a case PyUnicode_4BYTE_KIND: 10923 n/a { 10924 n/a #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10925 n/a int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10926 n/a /* normalize result of wmemcmp() into the range [-1; 1] */ 10927 n/a if (cmp < 0) 10928 n/a return -1; 10929 n/a if (cmp > 0) 10930 n/a return 1; 10931 n/a #else 10932 n/a COMPARE(Py_UCS4, Py_UCS4); 10933 n/a #endif 10934 n/a break; 10935 n/a } 10936 n/a default: 10937 n/a assert(0); 10938 n/a } 10939 n/a break; 10940 n/a } 10941 n/a default: 10942 n/a assert(0); 10943 n/a } 10944 n/a 10945 n/a if (len1 == len2) 10946 n/a return 0; 10947 n/a if (len1 < len2) 10948 n/a return -1; 10949 n/a else 10950 n/a return 1; 10951 n/a 10952 n/a #undef COMPARE 10953 n/a } 10954 n/a 10955 n/a static int 10956 n/a unicode_compare_eq(PyObject *str1, PyObject *str2) 10957 n/a { 10958 n/a int kind; 10959 n/a void *data1, *data2; 10960 n/a Py_ssize_t len; 10961 n/a int cmp; 10962 n/a 10963 n/a len = PyUnicode_GET_LENGTH(str1); 10964 n/a if (PyUnicode_GET_LENGTH(str2) != len) 10965 n/a return 0; 10966 n/a kind = PyUnicode_KIND(str1); 10967 n/a if (PyUnicode_KIND(str2) != kind) 10968 n/a return 0; 10969 n/a data1 = PyUnicode_DATA(str1); 10970 n/a data2 = PyUnicode_DATA(str2); 10971 n/a 10972 n/a cmp = memcmp(data1, data2, len * kind); 10973 n/a return (cmp == 0); 10974 n/a } 10975 n/a 10976 n/a 10977 n/a int 10978 n/a PyUnicode_Compare(PyObject *left, PyObject *right) 10979 n/a { 10980 n/a if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10981 n/a if (PyUnicode_READY(left) == -1 || 10982 n/a PyUnicode_READY(right) == -1) 10983 n/a return -1; 10984 n/a 10985 n/a /* a string is equal to itself */ 10986 n/a if (left == right) 
10987 n/a return 0; 10988 n/a 10989 n/a return unicode_compare(left, right); 10990 n/a } 10991 n/a PyErr_Format(PyExc_TypeError, 10992 n/a "Can't compare %.100s and %.100s", 10993 n/a left->ob_type->tp_name, 10994 n/a right->ob_type->tp_name); 10995 n/a return -1; 10996 n/a } 10997 n/a 10998 n/a int 10999 n/a PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 11000 n/a { 11001 n/a Py_ssize_t i; 11002 n/a int kind; 11003 n/a Py_UCS4 chr; 11004 n/a const unsigned char *ustr = (const unsigned char *)str; 11005 n/a 11006 n/a assert(_PyUnicode_CHECK(uni)); 11007 n/a if (!PyUnicode_IS_READY(uni)) { 11008 n/a const wchar_t *ws = _PyUnicode_WSTR(uni); 11009 n/a /* Compare Unicode string and source character set string */ 11010 n/a for (i = 0; (chr = ws[i]) && ustr[i]; i++) { 11011 n/a if (chr != ustr[i]) 11012 n/a return (chr < ustr[i]) ? -1 : 1; 11013 n/a } 11014 n/a /* This check keeps Python strings that end in '\0' from comparing equal 11015 n/a to C strings identical up to that point. 
*/ 11016 n/a if (_PyUnicode_WSTR_LENGTH(uni) != i || chr) 11017 n/a return 1; /* uni is longer */ 11018 n/a if (ustr[i]) 11019 n/a return -1; /* str is longer */ 11020 n/a return 0; 11021 n/a } 11022 n/a kind = PyUnicode_KIND(uni); 11023 n/a if (kind == PyUnicode_1BYTE_KIND) { 11024 n/a const void *data = PyUnicode_1BYTE_DATA(uni); 11025 n/a size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 11026 n/a size_t len, len2 = strlen(str); 11027 n/a int cmp; 11028 n/a 11029 n/a len = Py_MIN(len1, len2); 11030 n/a cmp = memcmp(data, str, len); 11031 n/a if (cmp != 0) { 11032 n/a if (cmp < 0) 11033 n/a return -1; 11034 n/a else 11035 n/a return 1; 11036 n/a } 11037 n/a if (len1 > len2) 11038 n/a return 1; /* uni is longer */ 11039 n/a if (len1 < len2) 11040 n/a return -1; /* str is longer */ 11041 n/a return 0; 11042 n/a } 11043 n/a else { 11044 n/a void *data = PyUnicode_DATA(uni); 11045 n/a /* Compare Unicode string and source character set string */ 11046 n/a for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 11047 n/a if (chr != (unsigned char)str[i]) 11048 n/a return (chr < (unsigned char)(str[i])) ? -1 : 1; 11049 n/a /* This check keeps Python strings that end in '\0' from comparing equal 11050 n/a to C strings identical up to that point. 
*/ 11051 n/a if (PyUnicode_GET_LENGTH(uni) != i || chr) 11052 n/a return 1; /* uni is longer */ 11053 n/a if (str[i]) 11054 n/a return -1; /* str is longer */ 11055 n/a return 0; 11056 n/a } 11057 n/a } 11058 n/a 11059 n/a static int 11060 n/a non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) 11061 n/a { 11062 n/a size_t i, len; 11063 n/a const wchar_t *p; 11064 n/a len = (size_t)_PyUnicode_WSTR_LENGTH(unicode); 11065 n/a if (strlen(str) != len) 11066 n/a return 0; 11067 n/a p = _PyUnicode_WSTR(unicode); 11068 n/a assert(p); 11069 n/a for (i = 0; i < len; i++) { 11070 n/a unsigned char c = (unsigned char)str[i]; 11071 n/a if (c >= 128 || p[i] != (wchar_t)c) 11072 n/a return 0; 11073 n/a } 11074 n/a return 1; 11075 n/a } 11076 n/a 11077 n/a int 11078 n/a _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) 11079 n/a { 11080 n/a size_t len; 11081 n/a assert(_PyUnicode_CHECK(unicode)); 11082 n/a assert(str); 11083 n/a #ifndef NDEBUG 11084 n/a for (const char *p = str; *p; p++) { 11085 n/a assert((unsigned char)*p < 128); 11086 n/a } 11087 n/a #endif 11088 n/a if (PyUnicode_READY(unicode) == -1) { 11089 n/a /* Memory error or bad data */ 11090 n/a PyErr_Clear(); 11091 n/a return non_ready_unicode_equal_to_ascii_string(unicode, str); 11092 n/a } 11093 n/a if (!PyUnicode_IS_ASCII(unicode)) 11094 n/a return 0; 11095 n/a len = (size_t)PyUnicode_GET_LENGTH(unicode); 11096 n/a return strlen(str) == len && 11097 n/a memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; 11098 n/a } 11099 n/a 11100 n/a int 11101 n/a _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) 11102 n/a { 11103 n/a PyObject *right_uni; 11104 n/a Py_hash_t hash; 11105 n/a 11106 n/a assert(_PyUnicode_CHECK(left)); 11107 n/a assert(right->string); 11108 n/a #ifndef NDEBUG 11109 n/a for (const char *p = right->string; *p; p++) { 11110 n/a assert((unsigned char)*p < 128); 11111 n/a } 11112 n/a #endif 11113 n/a 11114 n/a if (PyUnicode_READY(left) == -1) { 
11115 n/a /* memory error or bad data */ 11116 n/a PyErr_Clear(); 11117 n/a return non_ready_unicode_equal_to_ascii_string(left, right->string); 11118 n/a } 11119 n/a 11120 n/a if (!PyUnicode_IS_ASCII(left)) 11121 n/a return 0; 11122 n/a 11123 n/a right_uni = _PyUnicode_FromId(right); /* borrowed */ 11124 n/a if (right_uni == NULL) { 11125 n/a /* memory error or bad data */ 11126 n/a PyErr_Clear(); 11127 n/a return _PyUnicode_EqualToASCIIString(left, right->string); 11128 n/a } 11129 n/a 11130 n/a if (left == right_uni) 11131 n/a return 1; 11132 n/a 11133 n/a if (PyUnicode_CHECK_INTERNED(left)) 11134 n/a return 0; 11135 n/a 11136 n/a assert(_PyUnicode_HASH(right_uni) != 1); 11137 n/a hash = _PyUnicode_HASH(left); 11138 n/a if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) 11139 n/a return 0; 11140 n/a 11141 n/a return unicode_compare_eq(left, right_uni); 11142 n/a } 11143 n/a 11144 n/a #define TEST_COND(cond) \ 11145 n/a ((cond) ? Py_True : Py_False) 11146 n/a 11147 n/a PyObject * 11148 n/a PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 11149 n/a { 11150 n/a int result; 11151 n/a PyObject *v; 11152 n/a 11153 n/a if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 11154 n/a Py_RETURN_NOTIMPLEMENTED; 11155 n/a 11156 n/a if (PyUnicode_READY(left) == -1 || 11157 n/a PyUnicode_READY(right) == -1) 11158 n/a return NULL; 11159 n/a 11160 n/a if (left == right) { 11161 n/a switch (op) { 11162 n/a case Py_EQ: 11163 n/a case Py_LE: 11164 n/a case Py_GE: 11165 n/a /* a string is equal to itself */ 11166 n/a v = Py_True; 11167 n/a break; 11168 n/a case Py_NE: 11169 n/a case Py_LT: 11170 n/a case Py_GT: 11171 n/a v = Py_False; 11172 n/a break; 11173 n/a default: 11174 n/a PyErr_BadArgument(); 11175 n/a return NULL; 11176 n/a } 11177 n/a } 11178 n/a else if (op == Py_EQ || op == Py_NE) { 11179 n/a result = unicode_compare_eq(left, right); 11180 n/a result ^= (op == Py_NE); 11181 n/a v = TEST_COND(result); 11182 n/a } 11183 n/a else { 11184 n/a result = 
unicode_compare(left, right); 11185 n/a 11186 n/a /* Convert the return value to a Boolean */ 11187 n/a switch (op) { 11188 n/a case Py_LE: 11189 n/a v = TEST_COND(result <= 0); 11190 n/a break; 11191 n/a case Py_GE: 11192 n/a v = TEST_COND(result >= 0); 11193 n/a break; 11194 n/a case Py_LT: 11195 n/a v = TEST_COND(result == -1); 11196 n/a break; 11197 n/a case Py_GT: 11198 n/a v = TEST_COND(result == 1); 11199 n/a break; 11200 n/a default: 11201 n/a PyErr_BadArgument(); 11202 n/a return NULL; 11203 n/a } 11204 n/a } 11205 n/a Py_INCREF(v); 11206 n/a return v; 11207 n/a } 11208 n/a 11209 n/a int 11210 n/a _PyUnicode_EQ(PyObject *aa, PyObject *bb) 11211 n/a { 11212 n/a return unicode_eq(aa, bb); 11213 n/a } 11214 n/a 11215 n/a int 11216 n/a PyUnicode_Contains(PyObject *str, PyObject *substr) 11217 n/a { 11218 n/a int kind1, kind2; 11219 n/a void *buf1, *buf2; 11220 n/a Py_ssize_t len1, len2; 11221 n/a int result; 11222 n/a 11223 n/a if (!PyUnicode_Check(substr)) { 11224 n/a PyErr_Format(PyExc_TypeError, 11225 n/a "'in <string>' requires string as left operand, not %.100s", 11226 n/a Py_TYPE(substr)->tp_name); 11227 n/a return -1; 11228 n/a } 11229 n/a if (PyUnicode_READY(substr) == -1) 11230 n/a return -1; 11231 n/a if (ensure_unicode(str) < 0) 11232 n/a return -1; 11233 n/a 11234 n/a kind1 = PyUnicode_KIND(str); 11235 n/a kind2 = PyUnicode_KIND(substr); 11236 n/a if (kind1 < kind2) 11237 n/a return 0; 11238 n/a len1 = PyUnicode_GET_LENGTH(str); 11239 n/a len2 = PyUnicode_GET_LENGTH(substr); 11240 n/a if (len1 < len2) 11241 n/a return 0; 11242 n/a buf1 = PyUnicode_DATA(str); 11243 n/a buf2 = PyUnicode_DATA(substr); 11244 n/a if (len2 == 1) { 11245 n/a Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 11246 n/a result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 11247 n/a return result; 11248 n/a } 11249 n/a if (kind2 != kind1) { 11250 n/a buf2 = _PyUnicode_AsKind(substr, kind1); 11251 n/a if (!buf2) 11252 n/a return -1; 11253 n/a } 11254 n/a 11255 n/a 
switch (kind1) { 11256 n/a case PyUnicode_1BYTE_KIND: 11257 n/a result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 11258 n/a break; 11259 n/a case PyUnicode_2BYTE_KIND: 11260 n/a result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 11261 n/a break; 11262 n/a case PyUnicode_4BYTE_KIND: 11263 n/a result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11264 n/a break; 11265 n/a default: 11266 n/a result = -1; 11267 n/a assert(0); 11268 n/a } 11269 n/a 11270 n/a if (kind2 != kind1) 11271 n/a PyMem_Free(buf2); 11272 n/a 11273 n/a return result; 11274 n/a } 11275 n/a 11276 n/a /* Concat to string or Unicode object giving a new Unicode object. */ 11277 n/a 11278 n/a PyObject * 11279 n/a PyUnicode_Concat(PyObject *left, PyObject *right) 11280 n/a { 11281 n/a PyObject *result; 11282 n/a Py_UCS4 maxchar, maxchar2; 11283 n/a Py_ssize_t left_len, right_len, new_len; 11284 n/a 11285 n/a if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0) 11286 n/a return NULL; 11287 n/a 11288 n/a /* Shortcuts */ 11289 n/a if (left == unicode_empty) 11290 n/a return PyUnicode_FromObject(right); 11291 n/a if (right == unicode_empty) 11292 n/a return PyUnicode_FromObject(left); 11293 n/a 11294 n/a left_len = PyUnicode_GET_LENGTH(left); 11295 n/a right_len = PyUnicode_GET_LENGTH(right); 11296 n/a if (left_len > PY_SSIZE_T_MAX - right_len) { 11297 n/a PyErr_SetString(PyExc_OverflowError, 11298 n/a "strings are too large to concat"); 11299 n/a return NULL; 11300 n/a } 11301 n/a new_len = left_len + right_len; 11302 n/a 11303 n/a maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11304 n/a maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11305 n/a maxchar = Py_MAX(maxchar, maxchar2); 11306 n/a 11307 n/a /* Concat the two Unicode strings */ 11308 n/a result = PyUnicode_New(new_len, maxchar); 11309 n/a if (result == NULL) 11310 n/a return NULL; 11311 n/a _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); 11312 n/a _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); 11313 
n/a assert(_PyUnicode_CheckConsistency(result, 1)); 11314 n/a return result; 11315 n/a } 11316 n/a 11317 n/a void 11318 n/a PyUnicode_Append(PyObject **p_left, PyObject *right) 11319 n/a { 11320 n/a PyObject *left, *res; 11321 n/a Py_UCS4 maxchar, maxchar2; 11322 n/a Py_ssize_t left_len, right_len, new_len; 11323 n/a 11324 n/a if (p_left == NULL) { 11325 n/a if (!PyErr_Occurred()) 11326 n/a PyErr_BadInternalCall(); 11327 n/a return; 11328 n/a } 11329 n/a left = *p_left; 11330 n/a if (right == NULL || left == NULL 11331 n/a || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11332 n/a if (!PyErr_Occurred()) 11333 n/a PyErr_BadInternalCall(); 11334 n/a goto error; 11335 n/a } 11336 n/a 11337 n/a if (PyUnicode_READY(left) == -1) 11338 n/a goto error; 11339 n/a if (PyUnicode_READY(right) == -1) 11340 n/a goto error; 11341 n/a 11342 n/a /* Shortcuts */ 11343 n/a if (left == unicode_empty) { 11344 n/a Py_DECREF(left); 11345 n/a Py_INCREF(right); 11346 n/a *p_left = right; 11347 n/a return; 11348 n/a } 11349 n/a if (right == unicode_empty) 11350 n/a return; 11351 n/a 11352 n/a left_len = PyUnicode_GET_LENGTH(left); 11353 n/a right_len = PyUnicode_GET_LENGTH(right); 11354 n/a if (left_len > PY_SSIZE_T_MAX - right_len) { 11355 n/a PyErr_SetString(PyExc_OverflowError, 11356 n/a "strings are too large to concat"); 11357 n/a goto error; 11358 n/a } 11359 n/a new_len = left_len + right_len; 11360 n/a 11361 n/a if (unicode_modifiable(left) 11362 n/a && PyUnicode_CheckExact(right) 11363 n/a && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11364 n/a /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11365 n/a to change the structure size, but characters are stored just after 11366 n/a the structure, and so it requires to move all characters which is 11367 n/a not so different than duplicating the string. 
*/ 11368 n/a && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11369 n/a { 11370 n/a /* append inplace */ 11371 n/a if (unicode_resize(p_left, new_len) != 0) 11372 n/a goto error; 11373 n/a 11374 n/a /* copy 'right' into the newly allocated area of 'left' */ 11375 n/a _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11376 n/a } 11377 n/a else { 11378 n/a maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11379 n/a maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11380 n/a maxchar = Py_MAX(maxchar, maxchar2); 11381 n/a 11382 n/a /* Concat the two Unicode strings */ 11383 n/a res = PyUnicode_New(new_len, maxchar); 11384 n/a if (res == NULL) 11385 n/a goto error; 11386 n/a _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11387 n/a _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11388 n/a Py_DECREF(left); 11389 n/a *p_left = res; 11390 n/a } 11391 n/a assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11392 n/a return; 11393 n/a 11394 n/a error: 11395 n/a Py_CLEAR(*p_left); 11396 n/a } 11397 n/a 11398 n/a void 11399 n/a PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11400 n/a { 11401 n/a PyUnicode_Append(pleft, right); 11402 n/a Py_XDECREF(right); 11403 n/a } 11404 n/a 11405 n/a /* 11406 n/a Wraps stringlib_parse_args_finds() and additionally ensures that the 11407 n/a first argument is a unicode object. 
11408 n/a */ 11409 n/a 11410 n/a static inline int 11411 n/a parse_args_finds_unicode(const char * function_name, PyObject *args, 11412 n/a PyObject **substring, 11413 n/a Py_ssize_t *start, Py_ssize_t *end) 11414 n/a { 11415 n/a if(stringlib_parse_args_finds(function_name, args, substring, 11416 n/a start, end)) { 11417 n/a if (ensure_unicode(*substring) < 0) 11418 n/a return 0; 11419 n/a return 1; 11420 n/a } 11421 n/a return 0; 11422 n/a } 11423 n/a 11424 n/a PyDoc_STRVAR(count__doc__, 11425 n/a "S.count(sub[, start[, end]]) -> int\n\ 11426 n/a \n\ 11427 n/a Return the number of non-overlapping occurrences of substring sub in\n\ 11428 n/a string S[start:end]. Optional arguments start and end are\n\ 11429 n/a interpreted as in slice notation."); 11430 n/a 11431 n/a static PyObject * 11432 n/a unicode_count(PyObject *self, PyObject *args) 11433 n/a { 11434 n/a PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11435 n/a Py_ssize_t start = 0; 11436 n/a Py_ssize_t end = PY_SSIZE_T_MAX; 11437 n/a PyObject *result; 11438 n/a int kind1, kind2; 11439 n/a void *buf1, *buf2; 11440 n/a Py_ssize_t len1, len2, iresult; 11441 n/a 11442 n/a if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) 11443 n/a return NULL; 11444 n/a 11445 n/a kind1 = PyUnicode_KIND(self); 11446 n/a kind2 = PyUnicode_KIND(substring); 11447 n/a if (kind1 < kind2) 11448 n/a return PyLong_FromLong(0); 11449 n/a 11450 n/a len1 = PyUnicode_GET_LENGTH(self); 11451 n/a len2 = PyUnicode_GET_LENGTH(substring); 11452 n/a ADJUST_INDICES(start, end, len1); 11453 n/a if (end - start < len2) 11454 n/a return PyLong_FromLong(0); 11455 n/a 11456 n/a buf1 = PyUnicode_DATA(self); 11457 n/a buf2 = PyUnicode_DATA(substring); 11458 n/a if (kind2 != kind1) { 11459 n/a buf2 = _PyUnicode_AsKind(substring, kind1); 11460 n/a if (!buf2) 11461 n/a return NULL; 11462 n/a } 11463 n/a switch (kind1) { 11464 n/a case PyUnicode_1BYTE_KIND: 11465 n/a iresult = ucs1lib_count( 11466 n/a 
((Py_UCS1*)buf1) + start, end - start, 11467 n/a buf2, len2, PY_SSIZE_T_MAX 11468 n/a ); 11469 n/a break; 11470 n/a case PyUnicode_2BYTE_KIND: 11471 n/a iresult = ucs2lib_count( 11472 n/a ((Py_UCS2*)buf1) + start, end - start, 11473 n/a buf2, len2, PY_SSIZE_T_MAX 11474 n/a ); 11475 n/a break; 11476 n/a case PyUnicode_4BYTE_KIND: 11477 n/a iresult = ucs4lib_count( 11478 n/a ((Py_UCS4*)buf1) + start, end - start, 11479 n/a buf2, len2, PY_SSIZE_T_MAX 11480 n/a ); 11481 n/a break; 11482 n/a default: 11483 n/a assert(0); iresult = 0; 11484 n/a } 11485 n/a 11486 n/a result = PyLong_FromSsize_t(iresult); 11487 n/a 11488 n/a if (kind2 != kind1) 11489 n/a PyMem_Free(buf2); 11490 n/a 11491 n/a return result; 11492 n/a } 11493 n/a 11494 n/a /*[clinic input] 11495 n/a str.encode as unicode_encode 11496 n/a 11497 n/a encoding: str(c_default="NULL") = 'utf-8' 11498 n/a The encoding in which to encode the string. 11499 n/a errors: str(c_default="NULL") = 'strict' 11500 n/a The error handling scheme to use for encoding errors. 11501 n/a The default is 'strict' meaning that encoding errors raise a 11502 n/a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and 11503 n/a 'xmlcharrefreplace' as well as any other name registered with 11504 n/a codecs.register_error that can handle UnicodeEncodeErrors. 11505 n/a 11506 n/a Encode the string using the codec registered for encoding. 11507 n/a [clinic start generated code]*/ 11508 n/a 11509 n/a static PyObject * 11510 n/a unicode_encode_impl(PyObject *self, const char *encoding, const char *errors) 11511 n/a /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/ 11512 n/a { 11513 n/a return PyUnicode_AsEncodedString(self, encoding, errors); 11514 n/a } 11515 n/a 11516 n/a /*[clinic input] 11517 n/a str.expandtabs as unicode_expandtabs 11518 n/a 11519 n/a tabsize: int = 8 11520 n/a 11521 n/a Return a copy where all tab characters are expanded using spaces. 
11522 n/a 11523 n/a If tabsize is not given, a tab size of 8 characters is assumed. 11524 n/a [clinic start generated code]*/ 11525 n/a 11526 n/a static PyObject * 11527 n/a unicode_expandtabs_impl(PyObject *self, int tabsize) 11528 n/a /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/ 11529 n/a { 11530 n/a Py_ssize_t i, j, line_pos, src_len, incr; 11531 n/a Py_UCS4 ch; 11532 n/a PyObject *u; 11533 n/a void *src_data, *dest_data; 11534 n/a int kind; 11535 n/a int found; 11536 n/a 11537 n/a if (PyUnicode_READY(self) == -1) 11538 n/a return NULL; 11539 n/a 11540 n/a /* First pass: determine size of output string */ 11541 n/a src_len = PyUnicode_GET_LENGTH(self); 11542 n/a i = j = line_pos = 0; 11543 n/a kind = PyUnicode_KIND(self); 11544 n/a src_data = PyUnicode_DATA(self); 11545 n/a found = 0; 11546 n/a for (; i < src_len; i++) { 11547 n/a ch = PyUnicode_READ(kind, src_data, i); 11548 n/a if (ch == '\t') { 11549 n/a found = 1; 11550 n/a if (tabsize > 0) { 11551 n/a incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11552 n/a if (j > PY_SSIZE_T_MAX - incr) 11553 n/a goto overflow; 11554 n/a line_pos += incr; 11555 n/a j += incr; 11556 n/a } 11557 n/a } 11558 n/a else { 11559 n/a if (j > PY_SSIZE_T_MAX - 1) 11560 n/a goto overflow; 11561 n/a line_pos++; 11562 n/a j++; 11563 n/a if (ch == '\n' || ch == '\r') 11564 n/a line_pos = 0; 11565 n/a } 11566 n/a } 11567 n/a if (!found) 11568 n/a return unicode_result_unchanged(self); 11569 n/a 11570 n/a /* Second pass: create output string and fill it */ 11571 n/a u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11572 n/a if (!u) 11573 n/a return NULL; 11574 n/a dest_data = PyUnicode_DATA(u); 11575 n/a 11576 n/a i = j = line_pos = 0; 11577 n/a 11578 n/a for (; i < src_len; i++) { 11579 n/a ch = PyUnicode_READ(kind, src_data, i); 11580 n/a if (ch == '\t') { 11581 n/a if (tabsize > 0) { 11582 n/a incr = tabsize - (line_pos % tabsize); 11583 n/a line_pos += incr; 11584 n/a FILL(kind, 
dest_data, ' ', j, incr); 11585 n/a j += incr; 11586 n/a } 11587 n/a } 11588 n/a else { 11589 n/a line_pos++; 11590 n/a PyUnicode_WRITE(kind, dest_data, j, ch); 11591 n/a j++; 11592 n/a if (ch == '\n' || ch == '\r') 11593 n/a line_pos = 0; 11594 n/a } 11595 n/a } 11596 n/a assert (j == PyUnicode_GET_LENGTH(u)); 11597 n/a return unicode_result(u); 11598 n/a 11599 n/a overflow: 11600 n/a PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11601 n/a return NULL; 11602 n/a } 11603 n/a 11604 n/a PyDoc_STRVAR(find__doc__, 11605 n/a "S.find(sub[, start[, end]]) -> int\n\ 11606 n/a \n\ 11607 n/a Return the lowest index in S where substring sub is found,\n\ 11608 n/a such that sub is contained within S[start:end]. Optional\n\ 11609 n/a arguments start and end are interpreted as in slice notation.\n\ 11610 n/a \n\ 11611 n/a Return -1 on failure."); 11612 n/a 11613 n/a static PyObject * 11614 n/a unicode_find(PyObject *self, PyObject *args) 11615 n/a { 11616 n/a /* initialize variables to prevent gcc warning */ 11617 n/a PyObject *substring = NULL; 11618 n/a Py_ssize_t start = 0; 11619 n/a Py_ssize_t end = 0; 11620 n/a Py_ssize_t result; 11621 n/a 11622 n/a if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) 11623 n/a return NULL; 11624 n/a 11625 n/a if (PyUnicode_READY(self) == -1) 11626 n/a return NULL; 11627 n/a 11628 n/a result = any_find_slice(self, substring, start, end, 1); 11629 n/a 11630 n/a if (result == -2) 11631 n/a return NULL; 11632 n/a 11633 n/a return PyLong_FromSsize_t(result); 11634 n/a } 11635 n/a 11636 n/a static PyObject * 11637 n/a unicode_getitem(PyObject *self, Py_ssize_t index) 11638 n/a { 11639 n/a void *data; 11640 n/a enum PyUnicode_Kind kind; 11641 n/a Py_UCS4 ch; 11642 n/a 11643 n/a if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11644 n/a PyErr_BadArgument(); 11645 n/a return NULL; 11646 n/a } 11647 n/a if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11648 n/a PyErr_SetString(PyExc_IndexError, 
"string index out of range"); 11649 n/a return NULL; 11650 n/a } 11651 n/a kind = PyUnicode_KIND(self); 11652 n/a data = PyUnicode_DATA(self); 11653 n/a ch = PyUnicode_READ(kind, data, index); 11654 n/a return unicode_char(ch); 11655 n/a } 11656 n/a 11657 n/a /* Believe it or not, this produces the same value for ASCII strings 11658 n/a as bytes_hash(). */ 11659 n/a static Py_hash_t 11660 n/a unicode_hash(PyObject *self) 11661 n/a { 11662 n/a Py_ssize_t len; 11663 n/a Py_uhash_t x; /* Unsigned for defined overflow behavior. */ 11664 n/a 11665 n/a #ifdef Py_DEBUG 11666 n/a assert(_Py_HashSecret_Initialized); 11667 n/a #endif 11668 n/a if (_PyUnicode_HASH(self) != -1) 11669 n/a return _PyUnicode_HASH(self); 11670 n/a if (PyUnicode_READY(self) == -1) 11671 n/a return -1; 11672 n/a len = PyUnicode_GET_LENGTH(self); 11673 n/a /* 11674 n/a We make the hash of the empty string be 0, rather than using 11675 n/a (prefix ^ suffix), since this slightly obfuscates the hash secret 11676 n/a */ 11677 n/a if (len == 0) { 11678 n/a _PyUnicode_HASH(self) = 0; 11679 n/a return 0; 11680 n/a } 11681 n/a x = _Py_HashBytes(PyUnicode_DATA(self), 11682 n/a PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11683 n/a _PyUnicode_HASH(self) = x; 11684 n/a return x; 11685 n/a } 11686 n/a 11687 n/a PyDoc_STRVAR(index__doc__, 11688 n/a "S.index(sub[, start[, end]]) -> int\n\ 11689 n/a \n\ 11690 n/a Like S.find() but raise ValueError when the substring is not found."); 11691 n/a 11692 n/a static PyObject * 11693 n/a unicode_index(PyObject *self, PyObject *args) 11694 n/a { 11695 n/a /* initialize variables to prevent gcc warning */ 11696 n/a Py_ssize_t result; 11697 n/a PyObject *substring = NULL; 11698 n/a Py_ssize_t start = 0; 11699 n/a Py_ssize_t end = 0; 11700 n/a 11701 n/a if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) 11702 n/a return NULL; 11703 n/a 11704 n/a if (PyUnicode_READY(self) == -1) 11705 n/a return NULL; 11706 n/a 11707 n/a result = any_find_slice(self, 
substring, start, end, 1); 11708 n/a 11709 n/a if (result == -2) 11710 n/a return NULL; 11711 n/a 11712 n/a if (result < 0) { 11713 n/a PyErr_SetString(PyExc_ValueError, "substring not found"); 11714 n/a return NULL; 11715 n/a } 11716 n/a 11717 n/a return PyLong_FromSsize_t(result); 11718 n/a } 11719 n/a 11720 n/a /*[clinic input] 11721 n/a str.islower as unicode_islower 11722 n/a 11723 n/a Return True if the string is a lowercase string, False otherwise. 11724 n/a 11725 n/a A string is lowercase if all cased characters in the string are lowercase and 11726 n/a there is at least one cased character in the string. 11727 n/a [clinic start generated code]*/ 11728 n/a 11729 n/a static PyObject * 11730 n/a unicode_islower_impl(PyObject *self) 11731 n/a /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/ 11732 n/a { 11733 n/a Py_ssize_t i, length; 11734 n/a int kind; 11735 n/a void *data; 11736 n/a int cased; 11737 n/a 11738 n/a if (PyUnicode_READY(self) == -1) 11739 n/a return NULL; 11740 n/a length = PyUnicode_GET_LENGTH(self); 11741 n/a kind = PyUnicode_KIND(self); 11742 n/a data = PyUnicode_DATA(self); 11743 n/a 11744 n/a /* Shortcut for single character strings */ 11745 n/a if (length == 1) 11746 n/a return PyBool_FromLong( 11747 n/a Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11748 n/a 11749 n/a /* Special case for empty strings */ 11750 n/a if (length == 0) 11751 n/a return PyBool_FromLong(0); 11752 n/a 11753 n/a cased = 0; 11754 n/a for (i = 0; i < length; i++) { 11755 n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11756 n/a 11757 n/a if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11758 n/a return PyBool_FromLong(0); 11759 n/a else if (!cased && Py_UNICODE_ISLOWER(ch)) 11760 n/a cased = 1; 11761 n/a } 11762 n/a return PyBool_FromLong(cased); 11763 n/a } 11764 n/a 11765 n/a /*[clinic input] 11766 n/a str.isupper as unicode_isupper 11767 n/a 11768 n/a Return True if the string is an uppercase string, False 
otherwise.

A string is uppercase if all cased characters in the string are uppercase and
there is at least one cased character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
{
    /* Result is a Python bool; NULL is returned only when
       PyUnicode_READY() fails. */
    Py_ssize_t pos, nchars;
    int kind;
    void *data;
    int seen_upper = 0;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);
    nchars = PyUnicode_GET_LENGTH(self);

    /* The empty string has no cased character at all. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }
    /* A one-character string is decided by that character alone. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    /* A lower- or title-case character anywhere settles the answer; while
       scanning, remember whether any uppercase character occurred. */
    for (pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}

/*[clinic input]
str.istitle as unicode_istitle

Return True if the string is a title-cased string, False otherwise.

In a title-cased string, upper- and title-case characters may only
follow uncased characters and lowercase characters only cased ones.
11817 n/a [clinic start generated code]*/ 11818 n/a 11819 n/a static PyObject * 11820 n/a unicode_istitle_impl(PyObject *self) 11821 n/a /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/ 11822 n/a { 11823 n/a Py_ssize_t i, length; 11824 n/a int kind; 11825 n/a void *data; 11826 n/a int cased, previous_is_cased; 11827 n/a 11828 n/a if (PyUnicode_READY(self) == -1) 11829 n/a return NULL; 11830 n/a length = PyUnicode_GET_LENGTH(self); 11831 n/a kind = PyUnicode_KIND(self); 11832 n/a data = PyUnicode_DATA(self); 11833 n/a 11834 n/a /* Shortcut for single character strings */ 11835 n/a if (length == 1) { 11836 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11837 n/a return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11838 n/a (Py_UNICODE_ISUPPER(ch) != 0)); 11839 n/a } 11840 n/a 11841 n/a /* Special case for empty strings */ 11842 n/a if (length == 0) 11843 n/a return PyBool_FromLong(0); 11844 n/a 11845 n/a cased = 0; 11846 n/a previous_is_cased = 0; 11847 n/a for (i = 0; i < length; i++) { 11848 n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11849 n/a 11850 n/a if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11851 n/a if (previous_is_cased) 11852 n/a return PyBool_FromLong(0); 11853 n/a previous_is_cased = 1; 11854 n/a cased = 1; 11855 n/a } 11856 n/a else if (Py_UNICODE_ISLOWER(ch)) { 11857 n/a if (!previous_is_cased) 11858 n/a return PyBool_FromLong(0); 11859 n/a previous_is_cased = 1; 11860 n/a cased = 1; 11861 n/a } 11862 n/a else 11863 n/a previous_is_cased = 0; 11864 n/a } 11865 n/a return PyBool_FromLong(cased); 11866 n/a } 11867 n/a 11868 n/a /*[clinic input] 11869 n/a str.isspace as unicode_isspace 11870 n/a 11871 n/a Return True if the string is a whitespace string, False otherwise. 11872 n/a 11873 n/a A string is whitespace if all characters in the string are whitespace and there 11874 n/a is at least one character in the string. 
11875 n/a [clinic start generated code]*/ 11876 n/a 11877 n/a static PyObject * 11878 n/a unicode_isspace_impl(PyObject *self) 11879 n/a /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/ 11880 n/a { 11881 n/a Py_ssize_t i, length; 11882 n/a int kind; 11883 n/a void *data; 11884 n/a 11885 n/a if (PyUnicode_READY(self) == -1) 11886 n/a return NULL; 11887 n/a length = PyUnicode_GET_LENGTH(self); 11888 n/a kind = PyUnicode_KIND(self); 11889 n/a data = PyUnicode_DATA(self); 11890 n/a 11891 n/a /* Shortcut for single character strings */ 11892 n/a if (length == 1) 11893 n/a return PyBool_FromLong( 11894 n/a Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11895 n/a 11896 n/a /* Special case for empty strings */ 11897 n/a if (length == 0) 11898 n/a return PyBool_FromLong(0); 11899 n/a 11900 n/a for (i = 0; i < length; i++) { 11901 n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11902 n/a if (!Py_UNICODE_ISSPACE(ch)) 11903 n/a return PyBool_FromLong(0); 11904 n/a } 11905 n/a return PyBool_FromLong(1); 11906 n/a } 11907 n/a 11908 n/a /*[clinic input] 11909 n/a str.isalpha as unicode_isalpha 11910 n/a 11911 n/a Return True if the string is an alphabetic string, False otherwise. 11912 n/a 11913 n/a A string is alphabetic if all characters in the string are alphabetic and there 11914 n/a is at least one character in the string. 
11915 n/a [clinic start generated code]*/ 11916 n/a 11917 n/a static PyObject * 11918 n/a unicode_isalpha_impl(PyObject *self) 11919 n/a /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/ 11920 n/a { 11921 n/a Py_ssize_t i, length; 11922 n/a int kind; 11923 n/a void *data; 11924 n/a 11925 n/a if (PyUnicode_READY(self) == -1) 11926 n/a return NULL; 11927 n/a length = PyUnicode_GET_LENGTH(self); 11928 n/a kind = PyUnicode_KIND(self); 11929 n/a data = PyUnicode_DATA(self); 11930 n/a 11931 n/a /* Shortcut for single character strings */ 11932 n/a if (length == 1) 11933 n/a return PyBool_FromLong( 11934 n/a Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11935 n/a 11936 n/a /* Special case for empty strings */ 11937 n/a if (length == 0) 11938 n/a return PyBool_FromLong(0); 11939 n/a 11940 n/a for (i = 0; i < length; i++) { 11941 n/a if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11942 n/a return PyBool_FromLong(0); 11943 n/a } 11944 n/a return PyBool_FromLong(1); 11945 n/a } 11946 n/a 11947 n/a /*[clinic input] 11948 n/a str.isalnum as unicode_isalnum 11949 n/a 11950 n/a Return True if the string is an alpha-numeric string, False otherwise. 11951 n/a 11952 n/a A string is alpha-numeric if all characters in the string are alpha-numeric and 11953 n/a there is at least one character in the string. 
11954 n/a [clinic start generated code]*/ 11955 n/a 11956 n/a static PyObject * 11957 n/a unicode_isalnum_impl(PyObject *self) 11958 n/a /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/ 11959 n/a { 11960 n/a int kind; 11961 n/a void *data; 11962 n/a Py_ssize_t len, i; 11963 n/a 11964 n/a if (PyUnicode_READY(self) == -1) 11965 n/a return NULL; 11966 n/a 11967 n/a kind = PyUnicode_KIND(self); 11968 n/a data = PyUnicode_DATA(self); 11969 n/a len = PyUnicode_GET_LENGTH(self); 11970 n/a 11971 n/a /* Shortcut for single character strings */ 11972 n/a if (len == 1) { 11973 n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11974 n/a return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11975 n/a } 11976 n/a 11977 n/a /* Special case for empty strings */ 11978 n/a if (len == 0) 11979 n/a return PyBool_FromLong(0); 11980 n/a 11981 n/a for (i = 0; i < len; i++) { 11982 n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11983 n/a if (!Py_UNICODE_ISALNUM(ch)) 11984 n/a return PyBool_FromLong(0); 11985 n/a } 11986 n/a return PyBool_FromLong(1); 11987 n/a } 11988 n/a 11989 n/a /*[clinic input] 11990 n/a str.isdecimal as unicode_isdecimal 11991 n/a 11992 n/a Return True if the string is a decimal string, False otherwise. 11993 n/a 11994 n/a A string is a decimal string if all characters in the string are decimal and 11995 n/a there is at least one character in the string. 
11996 n/a [clinic start generated code]*/ 11997 n/a 11998 n/a static PyObject * 11999 n/a unicode_isdecimal_impl(PyObject *self) 12000 n/a /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/ 12001 n/a { 12002 n/a Py_ssize_t i, length; 12003 n/a int kind; 12004 n/a void *data; 12005 n/a 12006 n/a if (PyUnicode_READY(self) == -1) 12007 n/a return NULL; 12008 n/a length = PyUnicode_GET_LENGTH(self); 12009 n/a kind = PyUnicode_KIND(self); 12010 n/a data = PyUnicode_DATA(self); 12011 n/a 12012 n/a /* Shortcut for single character strings */ 12013 n/a if (length == 1) 12014 n/a return PyBool_FromLong( 12015 n/a Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 12016 n/a 12017 n/a /* Special case for empty strings */ 12018 n/a if (length == 0) 12019 n/a return PyBool_FromLong(0); 12020 n/a 12021 n/a for (i = 0; i < length; i++) { 12022 n/a if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 12023 n/a return PyBool_FromLong(0); 12024 n/a } 12025 n/a return PyBool_FromLong(1); 12026 n/a } 12027 n/a 12028 n/a /*[clinic input] 12029 n/a str.isdigit as unicode_isdigit 12030 n/a 12031 n/a Return True if the string is a digit string, False otherwise. 12032 n/a 12033 n/a A string is a digit string if all characters in the string are digits and there 12034 n/a is at least one character in the string. 
12035 n/a [clinic start generated code]*/ 12036 n/a 12037 n/a static PyObject * 12038 n/a unicode_isdigit_impl(PyObject *self) 12039 n/a /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/ 12040 n/a { 12041 n/a Py_ssize_t i, length; 12042 n/a int kind; 12043 n/a void *data; 12044 n/a 12045 n/a if (PyUnicode_READY(self) == -1) 12046 n/a return NULL; 12047 n/a length = PyUnicode_GET_LENGTH(self); 12048 n/a kind = PyUnicode_KIND(self); 12049 n/a data = PyUnicode_DATA(self); 12050 n/a 12051 n/a /* Shortcut for single character strings */ 12052 n/a if (length == 1) { 12053 n/a const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12054 n/a return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 12055 n/a } 12056 n/a 12057 n/a /* Special case for empty strings */ 12058 n/a if (length == 0) 12059 n/a return PyBool_FromLong(0); 12060 n/a 12061 n/a for (i = 0; i < length; i++) { 12062 n/a if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 12063 n/a return PyBool_FromLong(0); 12064 n/a } 12065 n/a return PyBool_FromLong(1); 12066 n/a } 12067 n/a 12068 n/a /*[clinic input] 12069 n/a str.isnumeric as unicode_isnumeric 12070 n/a 12071 n/a Return True if the string is a numeric string, False otherwise. 12072 n/a 12073 n/a A string is numeric if all characters in the string are numeric and there is at 12074 n/a least one character in the string. 
12075 n/a [clinic start generated code]*/ 12076 n/a 12077 n/a static PyObject * 12078 n/a unicode_isnumeric_impl(PyObject *self) 12079 n/a /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/ 12080 n/a { 12081 n/a Py_ssize_t i, length; 12082 n/a int kind; 12083 n/a void *data; 12084 n/a 12085 n/a if (PyUnicode_READY(self) == -1) 12086 n/a return NULL; 12087 n/a length = PyUnicode_GET_LENGTH(self); 12088 n/a kind = PyUnicode_KIND(self); 12089 n/a data = PyUnicode_DATA(self); 12090 n/a 12091 n/a /* Shortcut for single character strings */ 12092 n/a if (length == 1) 12093 n/a return PyBool_FromLong( 12094 n/a Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 12095 n/a 12096 n/a /* Special case for empty strings */ 12097 n/a if (length == 0) 12098 n/a return PyBool_FromLong(0); 12099 n/a 12100 n/a for (i = 0; i < length; i++) { 12101 n/a if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 12102 n/a return PyBool_FromLong(0); 12103 n/a } 12104 n/a return PyBool_FromLong(1); 12105 n/a } 12106 n/a 12107 n/a int 12108 n/a PyUnicode_IsIdentifier(PyObject *self) 12109 n/a { 12110 n/a int kind; 12111 n/a void *data; 12112 n/a Py_ssize_t i; 12113 n/a Py_UCS4 first; 12114 n/a 12115 n/a if (PyUnicode_READY(self) == -1) { 12116 n/a Py_FatalError("identifier not ready"); 12117 n/a return 0; 12118 n/a } 12119 n/a 12120 n/a /* Special case for empty strings */ 12121 n/a if (PyUnicode_GET_LENGTH(self) == 0) 12122 n/a return 0; 12123 n/a kind = PyUnicode_KIND(self); 12124 n/a data = PyUnicode_DATA(self); 12125 n/a 12126 n/a /* PEP 3131 says that the first character must be in 12127 n/a XID_Start and subsequent characters in XID_Continue, 12128 n/a and for the ASCII range, the 2.x rules apply (i.e 12129 n/a start with letters and underscore, continue with 12130 n/a letters, digits, underscore). 
However, given the current 12131 n/a definition of XID_Start and XID_Continue, it is sufficient 12132 n/a to check just for these, except that _ must be allowed 12133 n/a as starting an identifier. */ 12134 n/a first = PyUnicode_READ(kind, data, 0); 12135 n/a if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 12136 n/a return 0; 12137 n/a 12138 n/a for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 12139 n/a if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 12140 n/a return 0; 12141 n/a return 1; 12142 n/a } 12143 n/a 12144 n/a /*[clinic input] 12145 n/a str.isidentifier as unicode_isidentifier 12146 n/a 12147 n/a Return True if the string is a valid Python identifier, False otherwise. 12148 n/a 12149 n/a Use keyword.iskeyword() to test for reserved identifiers such as "def" and 12150 n/a "class". 12151 n/a [clinic start generated code]*/ 12152 n/a 12153 n/a static PyObject * 12154 n/a unicode_isidentifier_impl(PyObject *self) 12155 n/a /*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/ 12156 n/a { 12157 n/a return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 12158 n/a } 12159 n/a 12160 n/a /*[clinic input] 12161 n/a str.isprintable as unicode_isprintable 12162 n/a 12163 n/a Return True if the string is printable, False otherwise. 12164 n/a 12165 n/a A string is printable if all of its characters are considered printable in 12166 n/a repr() or if it is empty. 
12167 n/a [clinic start generated code]*/ 12168 n/a 12169 n/a static PyObject * 12170 n/a unicode_isprintable_impl(PyObject *self) 12171 n/a /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/ 12172 n/a { 12173 n/a Py_ssize_t i, length; 12174 n/a int kind; 12175 n/a void *data; 12176 n/a 12177 n/a if (PyUnicode_READY(self) == -1) 12178 n/a return NULL; 12179 n/a length = PyUnicode_GET_LENGTH(self); 12180 n/a kind = PyUnicode_KIND(self); 12181 n/a data = PyUnicode_DATA(self); 12182 n/a 12183 n/a /* Shortcut for single character strings */ 12184 n/a if (length == 1) 12185 n/a return PyBool_FromLong( 12186 n/a Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 12187 n/a 12188 n/a for (i = 0; i < length; i++) { 12189 n/a if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 12190 n/a Py_RETURN_FALSE; 12191 n/a } 12192 n/a } 12193 n/a Py_RETURN_TRUE; 12194 n/a } 12195 n/a 12196 n/a /*[clinic input] 12197 n/a str.join as unicode_join 12198 n/a 12199 n/a iterable: object 12200 n/a / 12201 n/a 12202 n/a Concatenate any number of strings. 12203 n/a 12204 n/a The string whose method is called is inserted in between each given string. 12205 n/a The result is returned as a new string. 
12206 n/a 12207 n/a Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs' 12208 n/a [clinic start generated code]*/ 12209 n/a 12210 n/a static PyObject * 12211 n/a unicode_join(PyObject *self, PyObject *iterable) 12212 n/a /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/ 12213 n/a { 12214 n/a return PyUnicode_Join(self, iterable); 12215 n/a } 12216 n/a 12217 n/a static Py_ssize_t 12218 n/a unicode_length(PyObject *self) 12219 n/a { 12220 n/a if (PyUnicode_READY(self) == -1) 12221 n/a return -1; 12222 n/a return PyUnicode_GET_LENGTH(self); 12223 n/a } 12224 n/a 12225 n/a /*[clinic input] 12226 n/a str.ljust as unicode_ljust 12227 n/a 12228 n/a width: Py_ssize_t 12229 n/a fillchar: Py_UCS4 = ' ' 12230 n/a / 12231 n/a 12232 n/a Return a left-justified string of length width. 12233 n/a 12234 n/a Padding is done using the specified fill character (default is a space). 12235 n/a [clinic start generated code]*/ 12236 n/a 12237 n/a static PyObject * 12238 n/a unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 12239 n/a /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/ 12240 n/a { 12241 n/a if (PyUnicode_READY(self) == -1) 12242 n/a return NULL; 12243 n/a 12244 n/a if (PyUnicode_GET_LENGTH(self) >= width) 12245 n/a return unicode_result_unchanged(self); 12246 n/a 12247 n/a return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 12248 n/a } 12249 n/a 12250 n/a /*[clinic input] 12251 n/a str.lower as unicode_lower 12252 n/a 12253 n/a Return a copy of the string converted to lowercase. 
12254 n/a [clinic start generated code]*/ 12255 n/a 12256 n/a static PyObject * 12257 n/a unicode_lower_impl(PyObject *self) 12258 n/a /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/ 12259 n/a { 12260 n/a if (PyUnicode_READY(self) == -1) 12261 n/a return NULL; 12262 n/a if (PyUnicode_IS_ASCII(self)) 12263 n/a return ascii_upper_or_lower(self, 1); 12264 n/a return case_operation(self, do_lower); 12265 n/a } 12266 n/a 12267 n/a #define LEFTSTRIP 0 12268 n/a #define RIGHTSTRIP 1 12269 n/a #define BOTHSTRIP 2 12270 n/a 12271 n/a /* Arrays indexed by above */ 12272 n/a static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"}; 12273 n/a 12274 n/a #define STRIPNAME(i) (stripfuncnames[i]) 12275 n/a 12276 n/a /* externally visible for str.strip(unicode) */ 12277 n/a PyObject * 12278 n/a _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 12279 n/a { 12280 n/a void *data; 12281 n/a int kind; 12282 n/a Py_ssize_t i, j, len; 12283 n/a BLOOM_MASK sepmask; 12284 n/a Py_ssize_t seplen; 12285 n/a 12286 n/a if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 12287 n/a return NULL; 12288 n/a 12289 n/a kind = PyUnicode_KIND(self); 12290 n/a data = PyUnicode_DATA(self); 12291 n/a len = PyUnicode_GET_LENGTH(self); 12292 n/a seplen = PyUnicode_GET_LENGTH(sepobj); 12293 n/a sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12294 n/a PyUnicode_DATA(sepobj), 12295 n/a seplen); 12296 n/a 12297 n/a i = 0; 12298 n/a if (striptype != RIGHTSTRIP) { 12299 n/a while (i < len) { 12300 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12301 n/a if (!BLOOM(sepmask, ch)) 12302 n/a break; 12303 n/a if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12304 n/a break; 12305 n/a i++; 12306 n/a } 12307 n/a } 12308 n/a 12309 n/a j = len; 12310 n/a if (striptype != LEFTSTRIP) { 12311 n/a j--; 12312 n/a while (j >= i) { 12313 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12314 n/a if (!BLOOM(sepmask, ch)) 12315 n/a break; 12316 n/a if 
(PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12317 n/a break; 12318 n/a j--; 12319 n/a } 12320 n/a 12321 n/a j++; 12322 n/a } 12323 n/a 12324 n/a return PyUnicode_Substring(self, i, j); 12325 n/a } 12326 n/a 12327 n/a PyObject* 12328 n/a PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12329 n/a { 12330 n/a unsigned char *data; 12331 n/a int kind; 12332 n/a Py_ssize_t length; 12333 n/a 12334 n/a if (PyUnicode_READY(self) == -1) 12335 n/a return NULL; 12336 n/a 12337 n/a length = PyUnicode_GET_LENGTH(self); 12338 n/a end = Py_MIN(end, length); 12339 n/a 12340 n/a if (start == 0 && end == length) 12341 n/a return unicode_result_unchanged(self); 12342 n/a 12343 n/a if (start < 0 || end < 0) { 12344 n/a PyErr_SetString(PyExc_IndexError, "string index out of range"); 12345 n/a return NULL; 12346 n/a } 12347 n/a if (start >= length || end < start) 12348 n/a _Py_RETURN_UNICODE_EMPTY(); 12349 n/a 12350 n/a length = end - start; 12351 n/a if (PyUnicode_IS_ASCII(self)) { 12352 n/a data = PyUnicode_1BYTE_DATA(self); 12353 n/a return _PyUnicode_FromASCII((char*)(data + start), length); 12354 n/a } 12355 n/a else { 12356 n/a kind = PyUnicode_KIND(self); 12357 n/a data = PyUnicode_1BYTE_DATA(self); 12358 n/a return PyUnicode_FromKindAndData(kind, 12359 n/a data + kind * start, 12360 n/a length); 12361 n/a } 12362 n/a } 12363 n/a 12364 n/a static PyObject * 12365 n/a do_strip(PyObject *self, int striptype) 12366 n/a { 12367 n/a Py_ssize_t len, i, j; 12368 n/a 12369 n/a if (PyUnicode_READY(self) == -1) 12370 n/a return NULL; 12371 n/a 12372 n/a len = PyUnicode_GET_LENGTH(self); 12373 n/a 12374 n/a if (PyUnicode_IS_ASCII(self)) { 12375 n/a Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12376 n/a 12377 n/a i = 0; 12378 n/a if (striptype != RIGHTSTRIP) { 12379 n/a while (i < len) { 12380 n/a Py_UCS1 ch = data[i]; 12381 n/a if (!_Py_ascii_whitespace[ch]) 12382 n/a break; 12383 n/a i++; 12384 n/a } 12385 n/a } 12386 n/a 12387 n/a j = len; 12388 n/a if (striptype 
!= LEFTSTRIP) { 12389 n/a j--; 12390 n/a while (j >= i) { 12391 n/a Py_UCS1 ch = data[j]; 12392 n/a if (!_Py_ascii_whitespace[ch]) 12393 n/a break; 12394 n/a j--; 12395 n/a } 12396 n/a j++; 12397 n/a } 12398 n/a } 12399 n/a else { 12400 n/a int kind = PyUnicode_KIND(self); 12401 n/a void *data = PyUnicode_DATA(self); 12402 n/a 12403 n/a i = 0; 12404 n/a if (striptype != RIGHTSTRIP) { 12405 n/a while (i < len) { 12406 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12407 n/a if (!Py_UNICODE_ISSPACE(ch)) 12408 n/a break; 12409 n/a i++; 12410 n/a } 12411 n/a } 12412 n/a 12413 n/a j = len; 12414 n/a if (striptype != LEFTSTRIP) { 12415 n/a j--; 12416 n/a while (j >= i) { 12417 n/a Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12418 n/a if (!Py_UNICODE_ISSPACE(ch)) 12419 n/a break; 12420 n/a j--; 12421 n/a } 12422 n/a j++; 12423 n/a } 12424 n/a } 12425 n/a 12426 n/a return PyUnicode_Substring(self, i, j); 12427 n/a } 12428 n/a 12429 n/a 12430 n/a static PyObject * 12431 n/a do_argstrip(PyObject *self, int striptype, PyObject *sep) 12432 n/a { 12433 n/a if (sep != NULL && sep != Py_None) { 12434 n/a if (PyUnicode_Check(sep)) 12435 n/a return _PyUnicode_XStrip(self, striptype, sep); 12436 n/a else { 12437 n/a PyErr_Format(PyExc_TypeError, 12438 n/a "%s arg must be None or str", 12439 n/a STRIPNAME(striptype)); 12440 n/a return NULL; 12441 n/a } 12442 n/a } 12443 n/a 12444 n/a return do_strip(self, striptype); 12445 n/a } 12446 n/a 12447 n/a 12448 n/a /*[clinic input] 12449 n/a str.strip as unicode_strip 12450 n/a 12451 n/a chars: object = None 12452 n/a / 12453 n/a 12454 n/a Return a copy of the string with leading and trailing whitespace remove. 12455 n/a 12456 n/a If chars is given and not None, remove characters in chars instead. 
12457 n/a [clinic start generated code]*/ 12458 n/a 12459 n/a static PyObject * 12460 n/a unicode_strip_impl(PyObject *self, PyObject *chars) 12461 n/a /*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/ 12462 n/a { 12463 n/a return do_argstrip(self, BOTHSTRIP, chars); 12464 n/a } 12465 n/a 12466 n/a 12467 n/a /*[clinic input] 12468 n/a str.lstrip as unicode_lstrip 12469 n/a 12470 n/a chars: object = NULL 12471 n/a / 12472 n/a 12473 n/a Return a copy of the string with leading whitespace removed. 12474 n/a 12475 n/a If chars is given and not None, remove characters in chars instead. 12476 n/a [clinic start generated code]*/ 12477 n/a 12478 n/a static PyObject * 12479 n/a unicode_lstrip_impl(PyObject *self, PyObject *chars) 12480 n/a /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/ 12481 n/a { 12482 n/a return do_argstrip(self, LEFTSTRIP, chars); 12483 n/a } 12484 n/a 12485 n/a 12486 n/a /*[clinic input] 12487 n/a str.rstrip as unicode_rstrip 12488 n/a 12489 n/a chars: object = NULL 12490 n/a / 12491 n/a 12492 n/a Return a copy of the string with trailing whitespace removed. 12493 n/a 12494 n/a If chars is given and not None, remove characters in chars instead. 
12495 n/a [clinic start generated code]*/ 12496 n/a 12497 n/a static PyObject * 12498 n/a unicode_rstrip_impl(PyObject *self, PyObject *chars) 12499 n/a /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/ 12500 n/a { 12501 n/a return do_argstrip(self, RIGHTSTRIP, chars); 12502 n/a } 12503 n/a 12504 n/a 12505 n/a static PyObject* 12506 n/a unicode_repeat(PyObject *str, Py_ssize_t len) 12507 n/a { 12508 n/a PyObject *u; 12509 n/a Py_ssize_t nchars, n; 12510 n/a 12511 n/a if (len < 1) 12512 n/a _Py_RETURN_UNICODE_EMPTY(); 12513 n/a 12514 n/a /* no repeat, return original string */ 12515 n/a if (len == 1) 12516 n/a return unicode_result_unchanged(str); 12517 n/a 12518 n/a if (PyUnicode_READY(str) == -1) 12519 n/a return NULL; 12520 n/a 12521 n/a if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12522 n/a PyErr_SetString(PyExc_OverflowError, 12523 n/a "repeated string is too long"); 12524 n/a return NULL; 12525 n/a } 12526 n/a nchars = len * PyUnicode_GET_LENGTH(str); 12527 n/a 12528 n/a u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12529 n/a if (!u) 12530 n/a return NULL; 12531 n/a assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12532 n/a 12533 n/a if (PyUnicode_GET_LENGTH(str) == 1) { 12534 n/a const int kind = PyUnicode_KIND(str); 12535 n/a const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12536 n/a if (kind == PyUnicode_1BYTE_KIND) { 12537 n/a void *to = PyUnicode_DATA(u); 12538 n/a memset(to, (unsigned char)fill_char, len); 12539 n/a } 12540 n/a else if (kind == PyUnicode_2BYTE_KIND) { 12541 n/a Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12542 n/a for (n = 0; n < len; ++n) 12543 n/a ucs2[n] = fill_char; 12544 n/a } else { 12545 n/a Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12546 n/a assert(kind == PyUnicode_4BYTE_KIND); 12547 n/a for (n = 0; n < len; ++n) 12548 n/a ucs4[n] = fill_char; 12549 n/a } 12550 n/a } 12551 n/a else { 12552 n/a /* number of characters copied this far */ 12553 n/a Py_ssize_t 
done = PyUnicode_GET_LENGTH(str); 12554 n/a const Py_ssize_t char_size = PyUnicode_KIND(str); 12555 n/a char *to = (char *) PyUnicode_DATA(u); 12556 n/a memcpy(to, PyUnicode_DATA(str), 12557 n/a PyUnicode_GET_LENGTH(str) * char_size); 12558 n/a while (done < nchars) { 12559 n/a n = (done <= nchars-done) ? done : nchars-done; 12560 n/a memcpy(to + (done * char_size), to, n * char_size); 12561 n/a done += n; 12562 n/a } 12563 n/a } 12564 n/a 12565 n/a assert(_PyUnicode_CheckConsistency(u, 1)); 12566 n/a return u; 12567 n/a } 12568 n/a 12569 n/a PyObject * 12570 n/a PyUnicode_Replace(PyObject *str, 12571 n/a PyObject *substr, 12572 n/a PyObject *replstr, 12573 n/a Py_ssize_t maxcount) 12574 n/a { 12575 n/a if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || 12576 n/a ensure_unicode(replstr) < 0) 12577 n/a return NULL; 12578 n/a return replace(str, substr, replstr, maxcount); 12579 n/a } 12580 n/a 12581 n/a /*[clinic input] 12582 n/a str.replace as unicode_replace 12583 n/a 12584 n/a old: unicode 12585 n/a new: unicode 12586 n/a count: Py_ssize_t = -1 12587 n/a Maximum number of occurrences to replace. 12588 n/a -1 (the default value) means replace all occurrences. 12589 n/a / 12590 n/a 12591 n/a Return a copy with all occurrences of substring old replaced by new. 12592 n/a 12593 n/a If the optional argument count is given, only the first count occurrences are 12594 n/a replaced. 
12595 n/a [clinic start generated code]*/ 12596 n/a 12597 n/a static PyObject * 12598 n/a unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new, 12599 n/a Py_ssize_t count) 12600 n/a /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/ 12601 n/a { 12602 n/a if (PyUnicode_READY(self) == -1) 12603 n/a return NULL; 12604 n/a return replace(self, old, new, count); 12605 n/a } 12606 n/a 12607 n/a static PyObject * 12608 n/a unicode_repr(PyObject *unicode) 12609 n/a { 12610 n/a PyObject *repr; 12611 n/a Py_ssize_t isize; 12612 n/a Py_ssize_t osize, squote, dquote, i, o; 12613 n/a Py_UCS4 max, quote; 12614 n/a int ikind, okind, unchanged; 12615 n/a void *idata, *odata; 12616 n/a 12617 n/a if (PyUnicode_READY(unicode) == -1) 12618 n/a return NULL; 12619 n/a 12620 n/a isize = PyUnicode_GET_LENGTH(unicode); 12621 n/a idata = PyUnicode_DATA(unicode); 12622 n/a 12623 n/a /* Compute length of output, quote characters, and 12624 n/a maximum character */ 12625 n/a osize = 0; 12626 n/a max = 127; 12627 n/a squote = dquote = 0; 12628 n/a ikind = PyUnicode_KIND(unicode); 12629 n/a for (i = 0; i < isize; i++) { 12630 n/a Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12631 n/a Py_ssize_t incr = 1; 12632 n/a switch (ch) { 12633 n/a case '\'': squote++; break; 12634 n/a case '"': dquote++; break; 12635 n/a case '\\': case '\t': case '\r': case '\n': 12636 n/a incr = 2; 12637 n/a break; 12638 n/a default: 12639 n/a /* Fast-path ASCII */ 12640 n/a if (ch < ' ' || ch == 0x7f) 12641 n/a incr = 4; /* \xHH */ 12642 n/a else if (ch < 0x7f) 12643 n/a ; 12644 n/a else if (Py_UNICODE_ISPRINTABLE(ch)) 12645 n/a max = ch > max ? 
ch : max; 12646 n/a else if (ch < 0x100) 12647 n/a incr = 4; /* \xHH */ 12648 n/a else if (ch < 0x10000) 12649 n/a incr = 6; /* \uHHHH */ 12650 n/a else 12651 n/a incr = 10; /* \uHHHHHHHH */ 12652 n/a } 12653 n/a if (osize > PY_SSIZE_T_MAX - incr) { 12654 n/a PyErr_SetString(PyExc_OverflowError, 12655 n/a "string is too long to generate repr"); 12656 n/a return NULL; 12657 n/a } 12658 n/a osize += incr; 12659 n/a } 12660 n/a 12661 n/a quote = '\''; 12662 n/a unchanged = (osize == isize); 12663 n/a if (squote) { 12664 n/a unchanged = 0; 12665 n/a if (dquote) 12666 n/a /* Both squote and dquote present. Use squote, 12667 n/a and escape them */ 12668 n/a osize += squote; 12669 n/a else 12670 n/a quote = '"'; 12671 n/a } 12672 n/a osize += 2; /* quotes */ 12673 n/a 12674 n/a repr = PyUnicode_New(osize, max); 12675 n/a if (repr == NULL) 12676 n/a return NULL; 12677 n/a okind = PyUnicode_KIND(repr); 12678 n/a odata = PyUnicode_DATA(repr); 12679 n/a 12680 n/a PyUnicode_WRITE(okind, odata, 0, quote); 12681 n/a PyUnicode_WRITE(okind, odata, osize-1, quote); 12682 n/a if (unchanged) { 12683 n/a _PyUnicode_FastCopyCharacters(repr, 1, 12684 n/a unicode, 0, 12685 n/a isize); 12686 n/a } 12687 n/a else { 12688 n/a for (i = 0, o = 1; i < isize; i++) { 12689 n/a Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12690 n/a 12691 n/a /* Escape quotes and backslashes */ 12692 n/a if ((ch == quote) || (ch == '\\')) { 12693 n/a PyUnicode_WRITE(okind, odata, o++, '\\'); 12694 n/a PyUnicode_WRITE(okind, odata, o++, ch); 12695 n/a continue; 12696 n/a } 12697 n/a 12698 n/a /* Map special whitespace to '\t', \n', '\r' */ 12699 n/a if (ch == '\t') { 12700 n/a PyUnicode_WRITE(okind, odata, o++, '\\'); 12701 n/a PyUnicode_WRITE(okind, odata, o++, 't'); 12702 n/a } 12703 n/a else if (ch == '\n') { 12704 n/a PyUnicode_WRITE(okind, odata, o++, '\\'); 12705 n/a PyUnicode_WRITE(okind, odata, o++, 'n'); 12706 n/a } 12707 n/a else if (ch == '\r') { 12708 n/a PyUnicode_WRITE(okind, odata, o++, '\\'); 12709 
n/a PyUnicode_WRITE(okind, odata, o++, 'r'); 12710 n/a } 12711 n/a 12712 n/a /* Map non-printable US ASCII to '\xhh' */ 12713 n/a else if (ch < ' ' || ch == 0x7F) { 12714 n/a PyUnicode_WRITE(okind, odata, o++, '\\'); 12715 n/a PyUnicode_WRITE(okind, odata, o++, 'x'); 12716 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12717 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12718 n/a } 12719 n/a 12720 n/a /* Copy ASCII characters as-is */ 12721 n/a else if (ch < 0x7F) { 12722 n/a PyUnicode_WRITE(okind, odata, o++, ch); 12723 n/a } 12724 n/a 12725 n/a /* Non-ASCII characters */ 12726 n/a else { 12727 n/a /* Map Unicode whitespace and control characters 12728 n/a (categories Z* and C* except ASCII space) 12729 n/a */ 12730 n/a if (!Py_UNICODE_ISPRINTABLE(ch)) { 12731 n/a PyUnicode_WRITE(okind, odata, o++, '\\'); 12732 n/a /* Map 8-bit characters to '\xhh' */ 12733 n/a if (ch <= 0xff) { 12734 n/a PyUnicode_WRITE(okind, odata, o++, 'x'); 12735 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12736 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12737 n/a } 12738 n/a /* Map 16-bit characters to '\uxxxx' */ 12739 n/a else if (ch <= 0xffff) { 12740 n/a PyUnicode_WRITE(okind, odata, o++, 'u'); 12741 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12742 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12743 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12744 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12745 n/a } 12746 n/a /* Map 21-bit characters to '\U00xxxxxx' */ 12747 n/a else { 12748 n/a PyUnicode_WRITE(okind, odata, o++, 'U'); 12749 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12750 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12751 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12752 n/a PyUnicode_WRITE(okind, 
odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12753 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12754 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12755 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12756 n/a PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12757 n/a } 12758 n/a } 12759 n/a /* Copy characters as-is */ 12760 n/a else { 12761 n/a PyUnicode_WRITE(okind, odata, o++, ch); 12762 n/a } 12763 n/a } 12764 n/a } 12765 n/a } 12766 n/a /* Closing quote already added at the beginning */ 12767 n/a assert(_PyUnicode_CheckConsistency(repr, 1)); 12768 n/a return repr; 12769 n/a } 12770 n/a 12771 n/a PyDoc_STRVAR(rfind__doc__, 12772 n/a "S.rfind(sub[, start[, end]]) -> int\n\ 12773 n/a \n\ 12774 n/a Return the highest index in S where substring sub is found,\n\ 12775 n/a such that sub is contained within S[start:end]. Optional\n\ 12776 n/a arguments start and end are interpreted as in slice notation.\n\ 12777 n/a \n\ 12778 n/a Return -1 on failure."); 12779 n/a 12780 n/a static PyObject * 12781 n/a unicode_rfind(PyObject *self, PyObject *args) 12782 n/a { 12783 n/a /* initialize variables to prevent gcc warning */ 12784 n/a PyObject *substring = NULL; 12785 n/a Py_ssize_t start = 0; 12786 n/a Py_ssize_t end = 0; 12787 n/a Py_ssize_t result; 12788 n/a 12789 n/a if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) 12790 n/a return NULL; 12791 n/a 12792 n/a if (PyUnicode_READY(self) == -1) 12793 n/a return NULL; 12794 n/a 12795 n/a result = any_find_slice(self, substring, start, end, -1); 12796 n/a 12797 n/a if (result == -2) 12798 n/a return NULL; 12799 n/a 12800 n/a return PyLong_FromSsize_t(result); 12801 n/a } 12802 n/a 12803 n/a PyDoc_STRVAR(rindex__doc__, 12804 n/a "S.rindex(sub[, start[, end]]) -> int\n\ 12805 n/a \n\ 12806 n/a Like S.rfind() but raise ValueError when the substring is not found."); 12807 n/a 12808 n/a static PyObject * 12809 n/a 
unicode_rindex(PyObject *self, PyObject *args) 12810 n/a { 12811 n/a /* initialize variables to prevent gcc warning */ 12812 n/a PyObject *substring = NULL; 12813 n/a Py_ssize_t start = 0; 12814 n/a Py_ssize_t end = 0; 12815 n/a Py_ssize_t result; 12816 n/a 12817 n/a if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) 12818 n/a return NULL; 12819 n/a 12820 n/a if (PyUnicode_READY(self) == -1) 12821 n/a return NULL; 12822 n/a 12823 n/a result = any_find_slice(self, substring, start, end, -1); 12824 n/a 12825 n/a if (result == -2) 12826 n/a return NULL; 12827 n/a 12828 n/a if (result < 0) { 12829 n/a PyErr_SetString(PyExc_ValueError, "substring not found"); 12830 n/a return NULL; 12831 n/a } 12832 n/a 12833 n/a return PyLong_FromSsize_t(result); 12834 n/a } 12835 n/a 12836 n/a /*[clinic input] 12837 n/a str.rjust as unicode_rjust 12838 n/a 12839 n/a width: Py_ssize_t 12840 n/a fillchar: Py_UCS4 = ' ' 12841 n/a / 12842 n/a 12843 n/a Return a right-justified string of length width. 12844 n/a 12845 n/a Padding is done using the specified fill character (default is a space). 
12846 n/a [clinic start generated code]*/ 12847 n/a 12848 n/a static PyObject * 12849 n/a unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 12850 n/a /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/ 12851 n/a { 12852 n/a if (PyUnicode_READY(self) == -1) 12853 n/a return NULL; 12854 n/a 12855 n/a if (PyUnicode_GET_LENGTH(self) >= width) 12856 n/a return unicode_result_unchanged(self); 12857 n/a 12858 n/a return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12859 n/a } 12860 n/a 12861 n/a PyObject * 12862 n/a PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12863 n/a { 12864 n/a if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 12865 n/a return NULL; 12866 n/a 12867 n/a return split(s, sep, maxsplit); 12868 n/a } 12869 n/a 12870 n/a /*[clinic input] 12871 n/a str.split as unicode_split 12872 n/a 12873 n/a sep: object = None 12874 n/a The delimiter according which to split the string. 12875 n/a None (the default value) means split according to any whitespace, 12876 n/a and discard empty strings from the result. 12877 n/a maxsplit: Py_ssize_t = -1 12878 n/a Maximum number of splits to do. 12879 n/a -1 (the default value) means no limit. 12880 n/a 12881 n/a Return a list of the words in the string, using sep as the delimiter string. 
12882 n/a [clinic start generated code]*/ 12883 n/a 12884 n/a static PyObject * 12885 n/a unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) 12886 n/a /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/ 12887 n/a { 12888 n/a if (sep == Py_None) 12889 n/a return split(self, NULL, maxsplit); 12890 n/a if (PyUnicode_Check(sep)) 12891 n/a return split(self, sep, maxsplit); 12892 n/a 12893 n/a PyErr_Format(PyExc_TypeError, 12894 n/a "must be str or None, not %.100s", 12895 n/a Py_TYPE(sep)->tp_name); 12896 n/a return NULL; 12897 n/a } 12898 n/a 12899 n/a PyObject * 12900 n/a PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) 12901 n/a { 12902 n/a PyObject* out; 12903 n/a int kind1, kind2; 12904 n/a void *buf1, *buf2; 12905 n/a Py_ssize_t len1, len2; 12906 n/a 12907 n/a if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 12908 n/a return NULL; 12909 n/a 12910 n/a kind1 = PyUnicode_KIND(str_obj); 12911 n/a kind2 = PyUnicode_KIND(sep_obj); 12912 n/a len1 = PyUnicode_GET_LENGTH(str_obj); 12913 n/a len2 = PyUnicode_GET_LENGTH(sep_obj); 12914 n/a if (kind1 < kind2 || len1 < len2) { 12915 n/a _Py_INCREF_UNICODE_EMPTY(); 12916 n/a if (!unicode_empty) 12917 n/a out = NULL; 12918 n/a else { 12919 n/a out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); 12920 n/a Py_DECREF(unicode_empty); 12921 n/a } 12922 n/a return out; 12923 n/a } 12924 n/a buf1 = PyUnicode_DATA(str_obj); 12925 n/a buf2 = PyUnicode_DATA(sep_obj); 12926 n/a if (kind2 != kind1) { 12927 n/a buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12928 n/a if (!buf2) 12929 n/a return NULL; 12930 n/a } 12931 n/a 12932 n/a switch (kind1) { 12933 n/a case PyUnicode_1BYTE_KIND: 12934 n/a if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12935 n/a out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12936 n/a else 12937 n/a out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12938 n/a break; 12939 n/a case 
PyUnicode_2BYTE_KIND: 12940 n/a out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12941 n/a break; 12942 n/a case PyUnicode_4BYTE_KIND: 12943 n/a out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12944 n/a break; 12945 n/a default: 12946 n/a assert(0); 12947 n/a out = 0; 12948 n/a } 12949 n/a 12950 n/a if (kind2 != kind1) 12951 n/a PyMem_Free(buf2); 12952 n/a 12953 n/a return out; 12954 n/a } 12955 n/a 12956 n/a 12957 n/a PyObject * 12958 n/a PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) 12959 n/a { 12960 n/a PyObject* out; 12961 n/a int kind1, kind2; 12962 n/a void *buf1, *buf2; 12963 n/a Py_ssize_t len1, len2; 12964 n/a 12965 n/a if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 12966 n/a return NULL; 12967 n/a 12968 n/a kind1 = PyUnicode_KIND(str_obj); 12969 n/a kind2 = PyUnicode_KIND(sep_obj); 12970 n/a len1 = PyUnicode_GET_LENGTH(str_obj); 12971 n/a len2 = PyUnicode_GET_LENGTH(sep_obj); 12972 n/a if (kind1 < kind2 || len1 < len2) { 12973 n/a _Py_INCREF_UNICODE_EMPTY(); 12974 n/a if (!unicode_empty) 12975 n/a out = NULL; 12976 n/a else { 12977 n/a out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); 12978 n/a Py_DECREF(unicode_empty); 12979 n/a } 12980 n/a return out; 12981 n/a } 12982 n/a buf1 = PyUnicode_DATA(str_obj); 12983 n/a buf2 = PyUnicode_DATA(sep_obj); 12984 n/a if (kind2 != kind1) { 12985 n/a buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12986 n/a if (!buf2) 12987 n/a return NULL; 12988 n/a } 12989 n/a 12990 n/a switch (kind1) { 12991 n/a case PyUnicode_1BYTE_KIND: 12992 n/a if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12993 n/a out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12994 n/a else 12995 n/a out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12996 n/a break; 12997 n/a case PyUnicode_2BYTE_KIND: 12998 n/a out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12999 n/a break; 13000 n/a case 
PyUnicode_4BYTE_KIND: 13001 n/a out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13002 n/a break; 13003 n/a default: 13004 n/a assert(0); 13005 n/a out = 0; 13006 n/a } 13007 n/a 13008 n/a if (kind2 != kind1) 13009 n/a PyMem_Free(buf2); 13010 n/a 13011 n/a return out; 13012 n/a } 13013 n/a 13014 n/a /*[clinic input] 13015 n/a str.partition as unicode_partition 13016 n/a 13017 n/a sep: object 13018 n/a / 13019 n/a 13020 n/a Partition the string into three parts using the given separator. 13021 n/a 13022 n/a This will search for the separator in the string. If the separator is found, 13023 n/a returns a 3-tuple containing the part before the separator, the separator 13024 n/a itself, and the part after it. 13025 n/a 13026 n/a If the separator is not found, returns a 3-tuple containing the original string 13027 n/a and two empty strings. 13028 n/a [clinic start generated code]*/ 13029 n/a 13030 n/a static PyObject * 13031 n/a unicode_partition(PyObject *self, PyObject *sep) 13032 n/a /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/ 13033 n/a { 13034 n/a return PyUnicode_Partition(self, sep); 13035 n/a } 13036 n/a 13037 n/a /*[clinic input] 13038 n/a str.rpartition as unicode_rpartition = str.partition 13039 n/a 13040 n/a Partition the string into three parts using the given separator. 13041 n/a 13042 n/a This will search for the separator in the string, starting and the end. If 13043 n/a the separator is found, returns a 3-tuple containing the part before the 13044 n/a separator, the separator itself, and the part after it. 13045 n/a 13046 n/a If the separator is not found, returns a 3-tuple containing two empty strings 13047 n/a and the original string. 
13048 n/a [clinic start generated code]*/ 13049 n/a 13050 n/a static PyObject * 13051 n/a unicode_rpartition(PyObject *self, PyObject *sep) 13052 n/a /*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/ 13053 n/a { 13054 n/a return PyUnicode_RPartition(self, sep); 13055 n/a } 13056 n/a 13057 n/a PyObject * 13058 n/a PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 13059 n/a { 13060 n/a if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 13061 n/a return NULL; 13062 n/a 13063 n/a return rsplit(s, sep, maxsplit); 13064 n/a } 13065 n/a 13066 n/a /*[clinic input] 13067 n/a str.rsplit as unicode_rsplit = str.split 13068 n/a 13069 n/a Return a list of the words in the string, using sep as the delimiter string. 13070 n/a 13071 n/a Splits are done starting at the end of the string and working to the front. 13072 n/a [clinic start generated code]*/ 13073 n/a 13074 n/a static PyObject * 13075 n/a unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) 13076 n/a /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/ 13077 n/a { 13078 n/a if (sep == Py_None) 13079 n/a return rsplit(self, NULL, maxsplit); 13080 n/a if (PyUnicode_Check(sep)) 13081 n/a return rsplit(self, sep, maxsplit); 13082 n/a 13083 n/a PyErr_Format(PyExc_TypeError, 13084 n/a "must be str or None, not %.100s", 13085 n/a Py_TYPE(sep)->tp_name); 13086 n/a return NULL; 13087 n/a } 13088 n/a 13089 n/a /*[clinic input] 13090 n/a str.splitlines as unicode_splitlines 13091 n/a 13092 n/a keepends: int(c_default="0") = False 13093 n/a 13094 n/a Return a list of the lines in the string, breaking at line boundaries. 13095 n/a 13096 n/a Line breaks are not included in the resulting list unless keepends is given and 13097 n/a true. 
13098 n/a [clinic start generated code]*/ 13099 n/a 13100 n/a static PyObject * 13101 n/a unicode_splitlines_impl(PyObject *self, int keepends) 13102 n/a /*[clinic end generated code: output=f664dcdad153ec40 input=d6ff99fe43465b0f]*/ 13103 n/a { 13104 n/a return PyUnicode_Splitlines(self, keepends); 13105 n/a } 13106 n/a 13107 n/a static 13108 n/a PyObject *unicode_str(PyObject *self) 13109 n/a { 13110 n/a return unicode_result_unchanged(self); 13111 n/a } 13112 n/a 13113 n/a /*[clinic input] 13114 n/a str.swapcase as unicode_swapcase 13115 n/a 13116 n/a Convert uppercase characters to lowercase and lowercase characters to uppercase. 13117 n/a [clinic start generated code]*/ 13118 n/a 13119 n/a static PyObject * 13120 n/a unicode_swapcase_impl(PyObject *self) 13121 n/a /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/ 13122 n/a { 13123 n/a if (PyUnicode_READY(self) == -1) 13124 n/a return NULL; 13125 n/a return case_operation(self, do_swapcase); 13126 n/a } 13127 n/a 13128 n/a /*[clinic input] 13129 n/a 13130 n/a @staticmethod 13131 n/a str.maketrans as unicode_maketrans 13132 n/a 13133 n/a x: object 13134 n/a 13135 n/a y: unicode=NULL 13136 n/a 13137 n/a z: unicode=NULL 13138 n/a 13139 n/a / 13140 n/a 13141 n/a Return a translation table usable for str.translate(). 13142 n/a 13143 n/a If there is only one argument, it must be a dictionary mapping Unicode 13144 n/a ordinals (integers) or characters to Unicode ordinals, strings or None. 13145 n/a Character keys will be then converted to ordinals. 13146 n/a If there are two arguments, they must be strings of equal length, and 13147 n/a in the resulting dictionary, each character in x will be mapped to the 13148 n/a character at the same position in y. If there is a third argument, it 13149 n/a must be a string, whose characters will be mapped to None in the result. 
13150 n/a [clinic start generated code]*/ 13151 n/a 13152 n/a static PyObject * 13153 n/a unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 13154 n/a /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 13155 n/a { 13156 n/a PyObject *new = NULL, *key, *value; 13157 n/a Py_ssize_t i = 0; 13158 n/a int res; 13159 n/a 13160 n/a new = PyDict_New(); 13161 n/a if (!new) 13162 n/a return NULL; 13163 n/a if (y != NULL) { 13164 n/a int x_kind, y_kind, z_kind; 13165 n/a void *x_data, *y_data, *z_data; 13166 n/a 13167 n/a /* x must be a string too, of equal length */ 13168 n/a if (!PyUnicode_Check(x)) { 13169 n/a PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13170 n/a "be a string if there is a second argument"); 13171 n/a goto err; 13172 n/a } 13173 n/a if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13174 n/a PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13175 n/a "arguments must have equal length"); 13176 n/a goto err; 13177 n/a } 13178 n/a /* create entries for translating chars in x to those in y */ 13179 n/a x_kind = PyUnicode_KIND(x); 13180 n/a y_kind = PyUnicode_KIND(y); 13181 n/a x_data = PyUnicode_DATA(x); 13182 n/a y_data = PyUnicode_DATA(y); 13183 n/a for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13184 n/a key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13185 n/a if (!key) 13186 n/a goto err; 13187 n/a value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13188 n/a if (!value) { 13189 n/a Py_DECREF(key); 13190 n/a goto err; 13191 n/a } 13192 n/a res = PyDict_SetItem(new, key, value); 13193 n/a Py_DECREF(key); 13194 n/a Py_DECREF(value); 13195 n/a if (res < 0) 13196 n/a goto err; 13197 n/a } 13198 n/a /* create entries for deleting chars in z */ 13199 n/a if (z != NULL) { 13200 n/a z_kind = PyUnicode_KIND(z); 13201 n/a z_data = PyUnicode_DATA(z); 13202 n/a for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13203 n/a key = PyLong_FromLong(PyUnicode_READ(z_kind, 
z_data, i)); 13204 n/a if (!key) 13205 n/a goto err; 13206 n/a res = PyDict_SetItem(new, key, Py_None); 13207 n/a Py_DECREF(key); 13208 n/a if (res < 0) 13209 n/a goto err; 13210 n/a } 13211 n/a } 13212 n/a } else { 13213 n/a int kind; 13214 n/a void *data; 13215 n/a 13216 n/a /* x must be a dict */ 13217 n/a if (!PyDict_CheckExact(x)) { 13218 n/a PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13219 n/a "to maketrans it must be a dict"); 13220 n/a goto err; 13221 n/a } 13222 n/a /* copy entries into the new dict, converting string keys to int keys */ 13223 n/a while (PyDict_Next(x, &i, &key, &value)) { 13224 n/a if (PyUnicode_Check(key)) { 13225 n/a /* convert string keys to integer keys */ 13226 n/a PyObject *newkey; 13227 n/a if (PyUnicode_GET_LENGTH(key) != 1) { 13228 n/a PyErr_SetString(PyExc_ValueError, "string keys in translate " 13229 n/a "table must be of length 1"); 13230 n/a goto err; 13231 n/a } 13232 n/a kind = PyUnicode_KIND(key); 13233 n/a data = PyUnicode_DATA(key); 13234 n/a newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13235 n/a if (!newkey) 13236 n/a goto err; 13237 n/a res = PyDict_SetItem(new, newkey, value); 13238 n/a Py_DECREF(newkey); 13239 n/a if (res < 0) 13240 n/a goto err; 13241 n/a } else if (PyLong_Check(key)) { 13242 n/a /* just keep integer keys */ 13243 n/a if (PyDict_SetItem(new, key, value) < 0) 13244 n/a goto err; 13245 n/a } else { 13246 n/a PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13247 n/a "be strings or integers"); 13248 n/a goto err; 13249 n/a } 13250 n/a } 13251 n/a } 13252 n/a return new; 13253 n/a err: 13254 n/a Py_DECREF(new); 13255 n/a return NULL; 13256 n/a } 13257 n/a 13258 n/a /*[clinic input] 13259 n/a str.translate as unicode_translate 13260 n/a 13261 n/a table: object 13262 n/a Translation table, which must be a mapping of Unicode ordinals to 13263 n/a Unicode ordinals, strings, or None. 
13264 n/a / 13265 n/a 13266 n/a Replace each character in the string using the given translation table. 13267 n/a 13268 n/a The table must implement lookup/indexing via __getitem__, for instance a 13269 n/a dictionary or list. If this operation raises LookupError, the character is 13270 n/a left untouched. Characters mapped to None are deleted. 13271 n/a [clinic start generated code]*/ 13272 n/a 13273 n/a static PyObject * 13274 n/a unicode_translate(PyObject *self, PyObject *table) 13275 n/a /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/ 13276 n/a { 13277 n/a return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13278 n/a } 13279 n/a 13280 n/a /*[clinic input] 13281 n/a str.upper as unicode_upper 13282 n/a 13283 n/a Return a copy of the string converted to uppercase. 13284 n/a [clinic start generated code]*/ 13285 n/a 13286 n/a static PyObject * 13287 n/a unicode_upper_impl(PyObject *self) 13288 n/a /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/ 13289 n/a { 13290 n/a if (PyUnicode_READY(self) == -1) 13291 n/a return NULL; 13292 n/a if (PyUnicode_IS_ASCII(self)) 13293 n/a return ascii_upper_or_lower(self, 0); 13294 n/a return case_operation(self, do_upper); 13295 n/a } 13296 n/a 13297 n/a /*[clinic input] 13298 n/a str.zfill as unicode_zfill 13299 n/a 13300 n/a width: Py_ssize_t 13301 n/a / 13302 n/a 13303 n/a Pad a numeric string with zeros on the left, to fill a field of the given width. 13304 n/a 13305 n/a The string is never truncated. 
13306 n/a [clinic start generated code]*/ 13307 n/a 13308 n/a static PyObject * 13309 n/a unicode_zfill_impl(PyObject *self, Py_ssize_t width) 13310 n/a /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/ 13311 n/a { 13312 n/a Py_ssize_t fill; 13313 n/a PyObject *u; 13314 n/a int kind; 13315 n/a void *data; 13316 n/a Py_UCS4 chr; 13317 n/a 13318 n/a if (PyUnicode_READY(self) == -1) 13319 n/a return NULL; 13320 n/a 13321 n/a if (PyUnicode_GET_LENGTH(self) >= width) 13322 n/a return unicode_result_unchanged(self); 13323 n/a 13324 n/a fill = width - PyUnicode_GET_LENGTH(self); 13325 n/a 13326 n/a u = pad(self, fill, 0, '0'); 13327 n/a 13328 n/a if (u == NULL) 13329 n/a return NULL; 13330 n/a 13331 n/a kind = PyUnicode_KIND(u); 13332 n/a data = PyUnicode_DATA(u); 13333 n/a chr = PyUnicode_READ(kind, data, fill); 13334 n/a 13335 n/a if (chr == '+' || chr == '-') { 13336 n/a /* move sign to beginning of string */ 13337 n/a PyUnicode_WRITE(kind, data, 0, chr); 13338 n/a PyUnicode_WRITE(kind, data, fill, '0'); 13339 n/a } 13340 n/a 13341 n/a assert(_PyUnicode_CheckConsistency(u, 1)); 13342 n/a return u; 13343 n/a } 13344 n/a 13345 n/a #if 0 13346 n/a static PyObject * 13347 n/a unicode__decimal2ascii(PyObject *self) 13348 n/a { 13349 n/a return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13350 n/a } 13351 n/a #endif 13352 n/a 13353 n/a PyDoc_STRVAR(startswith__doc__, 13354 n/a "S.startswith(prefix[, start[, end]]) -> bool\n\ 13355 n/a \n\ 13356 n/a Return True if S starts with the specified prefix, False otherwise.\n\ 13357 n/a With optional start, test S beginning at that position.\n\ 13358 n/a With optional end, stop comparing S at that position.\n\ 13359 n/a prefix can also be a tuple of strings to try."); 13360 n/a 13361 n/a static PyObject * 13362 n/a unicode_startswith(PyObject *self, 13363 n/a PyObject *args) 13364 n/a { 13365 n/a PyObject *subobj; 13366 n/a PyObject *substring; 13367 n/a Py_ssize_t start = 0; 13368 n/a Py_ssize_t 
end = PY_SSIZE_T_MAX; 13369 n/a int result; 13370 n/a 13371 n/a if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13372 n/a return NULL; 13373 n/a if (PyTuple_Check(subobj)) { 13374 n/a Py_ssize_t i; 13375 n/a for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13376 n/a substring = PyTuple_GET_ITEM(subobj, i); 13377 n/a if (!PyUnicode_Check(substring)) { 13378 n/a PyErr_Format(PyExc_TypeError, 13379 n/a "tuple for startswith must only contain str, " 13380 n/a "not %.100s", 13381 n/a Py_TYPE(substring)->tp_name); 13382 n/a return NULL; 13383 n/a } 13384 n/a result = tailmatch(self, substring, start, end, -1); 13385 n/a if (result == -1) 13386 n/a return NULL; 13387 n/a if (result) { 13388 n/a Py_RETURN_TRUE; 13389 n/a } 13390 n/a } 13391 n/a /* nothing matched */ 13392 n/a Py_RETURN_FALSE; 13393 n/a } 13394 n/a if (!PyUnicode_Check(subobj)) { 13395 n/a PyErr_Format(PyExc_TypeError, 13396 n/a "startswith first arg must be str or " 13397 n/a "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13398 n/a return NULL; 13399 n/a } 13400 n/a result = tailmatch(self, subobj, start, end, -1); 13401 n/a if (result == -1) 13402 n/a return NULL; 13403 n/a return PyBool_FromLong(result); 13404 n/a } 13405 n/a 13406 n/a 13407 n/a PyDoc_STRVAR(endswith__doc__, 13408 n/a "S.endswith(suffix[, start[, end]]) -> bool\n\ 13409 n/a \n\ 13410 n/a Return True if S ends with the specified suffix, False otherwise.\n\ 13411 n/a With optional start, test S beginning at that position.\n\ 13412 n/a With optional end, stop comparing S at that position.\n\ 13413 n/a suffix can also be a tuple of strings to try."); 13414 n/a 13415 n/a static PyObject * 13416 n/a unicode_endswith(PyObject *self, 13417 n/a PyObject *args) 13418 n/a { 13419 n/a PyObject *subobj; 13420 n/a PyObject *substring; 13421 n/a Py_ssize_t start = 0; 13422 n/a Py_ssize_t end = PY_SSIZE_T_MAX; 13423 n/a int result; 13424 n/a 13425 n/a if (!stringlib_parse_args_finds("endswith", args, &subobj, 
&start, &end)) 13426 n/a return NULL; 13427 n/a if (PyTuple_Check(subobj)) { 13428 n/a Py_ssize_t i; 13429 n/a for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13430 n/a substring = PyTuple_GET_ITEM(subobj, i); 13431 n/a if (!PyUnicode_Check(substring)) { 13432 n/a PyErr_Format(PyExc_TypeError, 13433 n/a "tuple for endswith must only contain str, " 13434 n/a "not %.100s", 13435 n/a Py_TYPE(substring)->tp_name); 13436 n/a return NULL; 13437 n/a } 13438 n/a result = tailmatch(self, substring, start, end, +1); 13439 n/a if (result == -1) 13440 n/a return NULL; 13441 n/a if (result) { 13442 n/a Py_RETURN_TRUE; 13443 n/a } 13444 n/a } 13445 n/a Py_RETURN_FALSE; 13446 n/a } 13447 n/a if (!PyUnicode_Check(subobj)) { 13448 n/a PyErr_Format(PyExc_TypeError, 13449 n/a "endswith first arg must be str or " 13450 n/a "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13451 n/a return NULL; 13452 n/a } 13453 n/a result = tailmatch(self, subobj, start, end, +1); 13454 n/a if (result == -1) 13455 n/a return NULL; 13456 n/a return PyBool_FromLong(result); 13457 n/a } 13458 n/a 13459 n/a static inline void 13460 n/a _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13461 n/a { 13462 n/a writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13463 n/a writer->data = PyUnicode_DATA(writer->buffer); 13464 n/a 13465 n/a if (!writer->readonly) { 13466 n/a writer->kind = PyUnicode_KIND(writer->buffer); 13467 n/a writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13468 n/a } 13469 n/a else { 13470 n/a /* use a value smaller than PyUnicode_1BYTE_KIND() so 13471 n/a _PyUnicodeWriter_PrepareKind() will copy the buffer. */ 13472 n/a writer->kind = PyUnicode_WCHAR_KIND; 13473 n/a assert(writer->kind <= PyUnicode_1BYTE_KIND); 13474 n/a 13475 n/a /* Copy-on-write mode: set buffer size to 0 so 13476 n/a * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13477 n/a * next write. 
*/ 13478 n/a writer->size = 0; 13479 n/a } 13480 n/a } 13481 n/a 13482 n/a void 13483 n/a _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13484 n/a { 13485 n/a memset(writer, 0, sizeof(*writer)); 13486 n/a 13487 n/a /* ASCII is the bare minimum */ 13488 n/a writer->min_char = 127; 13489 n/a 13490 n/a /* use a value smaller than PyUnicode_1BYTE_KIND() so 13491 n/a _PyUnicodeWriter_PrepareKind() will copy the buffer. */ 13492 n/a writer->kind = PyUnicode_WCHAR_KIND; 13493 n/a assert(writer->kind <= PyUnicode_1BYTE_KIND); 13494 n/a } 13495 n/a 13496 n/a int 13497 n/a _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13498 n/a Py_ssize_t length, Py_UCS4 maxchar) 13499 n/a { 13500 n/a Py_ssize_t newlen; 13501 n/a PyObject *newbuffer; 13502 n/a 13503 n/a assert(maxchar <= MAX_UNICODE); 13504 n/a 13505 n/a /* ensure that the _PyUnicodeWriter_Prepare macro was used */ 13506 n/a assert((maxchar > writer->maxchar && length >= 0) 13507 n/a || length > 0); 13508 n/a 13509 n/a if (length > PY_SSIZE_T_MAX - writer->pos) { 13510 n/a PyErr_NoMemory(); 13511 n/a return -1; 13512 n/a } 13513 n/a newlen = writer->pos + length; 13514 n/a 13515 n/a maxchar = Py_MAX(maxchar, writer->min_char); 13516 n/a 13517 n/a if (writer->buffer == NULL) { 13518 n/a assert(!writer->readonly); 13519 n/a if (writer->overallocate 13520 n/a && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13521 n/a /* overallocate to limit the number of realloc() */ 13522 n/a newlen += newlen / OVERALLOCATE_FACTOR; 13523 n/a } 13524 n/a if (newlen < writer->min_length) 13525 n/a newlen = writer->min_length; 13526 n/a 13527 n/a writer->buffer = PyUnicode_New(newlen, maxchar); 13528 n/a if (writer->buffer == NULL) 13529 n/a return -1; 13530 n/a } 13531 n/a else if (newlen > writer->size) { 13532 n/a if (writer->overallocate 13533 n/a && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13534 n/a /* overallocate to limit the number of realloc() */ 13535 n/a newlen += newlen / 
OVERALLOCATE_FACTOR; 13536 n/a } 13537 n/a if (newlen < writer->min_length) 13538 n/a newlen = writer->min_length; 13539 n/a 13540 n/a if (maxchar > writer->maxchar || writer->readonly) { 13541 n/a /* resize + widen */ 13542 n/a maxchar = Py_MAX(maxchar, writer->maxchar); 13543 n/a newbuffer = PyUnicode_New(newlen, maxchar); 13544 n/a if (newbuffer == NULL) 13545 n/a return -1; 13546 n/a _PyUnicode_FastCopyCharacters(newbuffer, 0, 13547 n/a writer->buffer, 0, writer->pos); 13548 n/a Py_DECREF(writer->buffer); 13549 n/a writer->readonly = 0; 13550 n/a } 13551 n/a else { 13552 n/a newbuffer = resize_compact(writer->buffer, newlen); 13553 n/a if (newbuffer == NULL) 13554 n/a return -1; 13555 n/a } 13556 n/a writer->buffer = newbuffer; 13557 n/a } 13558 n/a else if (maxchar > writer->maxchar) { 13559 n/a assert(!writer->readonly); 13560 n/a newbuffer = PyUnicode_New(writer->size, maxchar); 13561 n/a if (newbuffer == NULL) 13562 n/a return -1; 13563 n/a _PyUnicode_FastCopyCharacters(newbuffer, 0, 13564 n/a writer->buffer, 0, writer->pos); 13565 n/a Py_SETREF(writer->buffer, newbuffer); 13566 n/a } 13567 n/a _PyUnicodeWriter_Update(writer); 13568 n/a return 0; 13569 n/a 13570 n/a #undef OVERALLOCATE_FACTOR 13571 n/a } 13572 n/a 13573 n/a int 13574 n/a _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 13575 n/a enum PyUnicode_Kind kind) 13576 n/a { 13577 n/a Py_UCS4 maxchar; 13578 n/a 13579 n/a /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ 13580 n/a assert(writer->kind < kind); 13581 n/a 13582 n/a switch (kind) 13583 n/a { 13584 n/a case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; 13585 n/a case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; 13586 n/a case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; 13587 n/a default: 13588 n/a assert(0 && "invalid kind"); 13589 n/a return -1; 13590 n/a } 13591 n/a 13592 n/a return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); 13593 n/a } 13594 n/a 13595 n/a static inline int 13596 n/a 
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13597 n/a { 13598 n/a assert(ch <= MAX_UNICODE); 13599 n/a if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13600 n/a return -1; 13601 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13602 n/a writer->pos++; 13603 n/a return 0; 13604 n/a } 13605 n/a 13606 n/a int 13607 n/a _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13608 n/a { 13609 n/a return _PyUnicodeWriter_WriteCharInline(writer, ch); 13610 n/a } 13611 n/a 13612 n/a int 13613 n/a _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13614 n/a { 13615 n/a Py_UCS4 maxchar; 13616 n/a Py_ssize_t len; 13617 n/a 13618 n/a if (PyUnicode_READY(str) == -1) 13619 n/a return -1; 13620 n/a len = PyUnicode_GET_LENGTH(str); 13621 n/a if (len == 0) 13622 n/a return 0; 13623 n/a maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13624 n/a if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13625 n/a if (writer->buffer == NULL && !writer->overallocate) { 13626 n/a assert(_PyUnicode_CheckConsistency(str, 1)); 13627 n/a writer->readonly = 1; 13628 n/a Py_INCREF(str); 13629 n/a writer->buffer = str; 13630 n/a _PyUnicodeWriter_Update(writer); 13631 n/a writer->pos += len; 13632 n/a return 0; 13633 n/a } 13634 n/a if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13635 n/a return -1; 13636 n/a } 13637 n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13638 n/a str, 0, len); 13639 n/a writer->pos += len; 13640 n/a return 0; 13641 n/a } 13642 n/a 13643 n/a int 13644 n/a _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13645 n/a Py_ssize_t start, Py_ssize_t end) 13646 n/a { 13647 n/a Py_UCS4 maxchar; 13648 n/a Py_ssize_t len; 13649 n/a 13650 n/a if (PyUnicode_READY(str) == -1) 13651 n/a return -1; 13652 n/a 13653 n/a assert(0 <= start); 13654 n/a assert(end <= PyUnicode_GET_LENGTH(str)); 13655 n/a assert(start <= end); 13656 n/a 13657 n/a if (end == 0) 
13658 n/a return 0; 13659 n/a 13660 n/a if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13661 n/a return _PyUnicodeWriter_WriteStr(writer, str); 13662 n/a 13663 n/a if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13664 n/a maxchar = _PyUnicode_FindMaxChar(str, start, end); 13665 n/a else 13666 n/a maxchar = writer->maxchar; 13667 n/a len = end - start; 13668 n/a 13669 n/a if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13670 n/a return -1; 13671 n/a 13672 n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13673 n/a str, start, len); 13674 n/a writer->pos += len; 13675 n/a return 0; 13676 n/a } 13677 n/a 13678 n/a int 13679 n/a _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13680 n/a const char *ascii, Py_ssize_t len) 13681 n/a { 13682 n/a if (len == -1) 13683 n/a len = strlen(ascii); 13684 n/a 13685 n/a assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13686 n/a 13687 n/a if (writer->buffer == NULL && !writer->overallocate) { 13688 n/a PyObject *str; 13689 n/a 13690 n/a str = _PyUnicode_FromASCII(ascii, len); 13691 n/a if (str == NULL) 13692 n/a return -1; 13693 n/a 13694 n/a writer->readonly = 1; 13695 n/a writer->buffer = str; 13696 n/a _PyUnicodeWriter_Update(writer); 13697 n/a writer->pos += len; 13698 n/a return 0; 13699 n/a } 13700 n/a 13701 n/a if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13702 n/a return -1; 13703 n/a 13704 n/a switch (writer->kind) 13705 n/a { 13706 n/a case PyUnicode_1BYTE_KIND: 13707 n/a { 13708 n/a const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13709 n/a Py_UCS1 *data = writer->data; 13710 n/a 13711 n/a memcpy(data + writer->pos, str, len); 13712 n/a break; 13713 n/a } 13714 n/a case PyUnicode_2BYTE_KIND: 13715 n/a { 13716 n/a _PyUnicode_CONVERT_BYTES( 13717 n/a Py_UCS1, Py_UCS2, 13718 n/a ascii, ascii + len, 13719 n/a (Py_UCS2 *)writer->data + writer->pos); 13720 n/a break; 13721 n/a } 13722 n/a case PyUnicode_4BYTE_KIND: 13723 n/a { 13724 n/a 
_PyUnicode_CONVERT_BYTES( 13725 n/a Py_UCS1, Py_UCS4, 13726 n/a ascii, ascii + len, 13727 n/a (Py_UCS4 *)writer->data + writer->pos); 13728 n/a break; 13729 n/a } 13730 n/a default: 13731 n/a assert(0); 13732 n/a } 13733 n/a 13734 n/a writer->pos += len; 13735 n/a return 0; 13736 n/a } 13737 n/a 13738 n/a int 13739 n/a _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13740 n/a const char *str, Py_ssize_t len) 13741 n/a { 13742 n/a Py_UCS4 maxchar; 13743 n/a 13744 n/a maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13745 n/a if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13746 n/a return -1; 13747 n/a unicode_write_cstr(writer->buffer, writer->pos, str, len); 13748 n/a writer->pos += len; 13749 n/a return 0; 13750 n/a } 13751 n/a 13752 n/a PyObject * 13753 n/a _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13754 n/a { 13755 n/a PyObject *str; 13756 n/a 13757 n/a if (writer->pos == 0) { 13758 n/a Py_CLEAR(writer->buffer); 13759 n/a _Py_RETURN_UNICODE_EMPTY(); 13760 n/a } 13761 n/a 13762 n/a str = writer->buffer; 13763 n/a writer->buffer = NULL; 13764 n/a 13765 n/a if (writer->readonly) { 13766 n/a assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13767 n/a return str; 13768 n/a } 13769 n/a 13770 n/a if (PyUnicode_GET_LENGTH(str) != writer->pos) { 13771 n/a PyObject *str2; 13772 n/a str2 = resize_compact(str, writer->pos); 13773 n/a if (str2 == NULL) { 13774 n/a Py_DECREF(str); 13775 n/a return NULL; 13776 n/a } 13777 n/a str = str2; 13778 n/a } 13779 n/a 13780 n/a assert(_PyUnicode_CheckConsistency(str, 1)); 13781 n/a return unicode_result_ready(str); 13782 n/a } 13783 n/a 13784 n/a void 13785 n/a _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13786 n/a { 13787 n/a Py_CLEAR(writer->buffer); 13788 n/a } 13789 n/a 13790 n/a #include "stringlib/unicode_format.h" 13791 n/a 13792 n/a PyDoc_STRVAR(format__doc__, 13793 n/a "S.format(*args, **kwargs) -> str\n\ 13794 n/a \n\ 13795 n/a Return a formatted version of S, using 
substitutions from args and kwargs.\n\ 13796 n/a The substitutions are identified by braces ('{' and '}')."); 13797 n/a 13798 n/a PyDoc_STRVAR(format_map__doc__, 13799 n/a "S.format_map(mapping) -> str\n\ 13800 n/a \n\ 13801 n/a Return a formatted version of S, using substitutions from mapping.\n\ 13802 n/a The substitutions are identified by braces ('{' and '}')."); 13803 n/a 13804 n/a /*[clinic input] 13805 n/a str.__format__ as unicode___format__ 13806 n/a 13807 n/a format_spec: unicode 13808 n/a / 13809 n/a 13810 n/a Return a formatted version of the string as described by format_spec. 13811 n/a [clinic start generated code]*/ 13812 n/a 13813 n/a static PyObject * 13814 n/a unicode___format___impl(PyObject *self, PyObject *format_spec) 13815 n/a /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/ 13816 n/a { 13817 n/a _PyUnicodeWriter writer; 13818 n/a int ret; 13819 n/a 13820 n/a if (PyUnicode_READY(self) == -1) 13821 n/a return NULL; 13822 n/a _PyUnicodeWriter_Init(&writer); 13823 n/a ret = _PyUnicode_FormatAdvancedWriter(&writer, 13824 n/a self, format_spec, 0, 13825 n/a PyUnicode_GET_LENGTH(format_spec)); 13826 n/a if (ret == -1) { 13827 n/a _PyUnicodeWriter_Dealloc(&writer); 13828 n/a return NULL; 13829 n/a } 13830 n/a return _PyUnicodeWriter_Finish(&writer); 13831 n/a } 13832 n/a 13833 n/a /*[clinic input] 13834 n/a str.__sizeof__ as unicode_sizeof 13835 n/a 13836 n/a Return the size of the string in memory, in bytes. 13837 n/a [clinic start generated code]*/ 13838 n/a 13839 n/a static PyObject * 13840 n/a unicode_sizeof_impl(PyObject *self) 13841 n/a /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/ 13842 n/a { 13843 n/a Py_ssize_t size; 13844 n/a 13845 n/a /* If it's a compact object, account for base structure + 13846 n/a character data. 
*/ 13847 n/a if (PyUnicode_IS_COMPACT_ASCII(self)) 13848 n/a size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1; 13849 n/a else if (PyUnicode_IS_COMPACT(self)) 13850 n/a size = sizeof(PyCompactUnicodeObject) + 13851 n/a (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self); 13852 n/a else { 13853 n/a /* If it is a two-block object, account for base object, and 13854 n/a for character block if present. */ 13855 n/a size = sizeof(PyUnicodeObject); 13856 n/a if (_PyUnicode_DATA_ANY(self)) 13857 n/a size += (PyUnicode_GET_LENGTH(self) + 1) * 13858 n/a PyUnicode_KIND(self); 13859 n/a } 13860 n/a /* If the wstr pointer is present, account for it unless it is shared 13861 n/a with the data pointer. Check if the data is not shared. */ 13862 n/a if (_PyUnicode_HAS_WSTR_MEMORY(self)) 13863 n/a size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t); 13864 n/a if (_PyUnicode_HAS_UTF8_MEMORY(self)) 13865 n/a size += PyUnicode_UTF8_LENGTH(self) + 1; 13866 n/a 13867 n/a return PyLong_FromSsize_t(size); 13868 n/a } 13869 n/a 13870 n/a static PyObject * 13871 n/a unicode_getnewargs(PyObject *v) 13872 n/a { 13873 n/a PyObject *copy = _PyUnicode_Copy(v); 13874 n/a if (!copy) 13875 n/a return NULL; 13876 n/a return Py_BuildValue("(N)", copy); 13877 n/a } 13878 n/a 13879 n/a static PyMethodDef unicode_methods[] = { 13880 n/a UNICODE_ENCODE_METHODDEF 13881 n/a UNICODE_REPLACE_METHODDEF 13882 n/a UNICODE_SPLIT_METHODDEF 13883 n/a UNICODE_RSPLIT_METHODDEF 13884 n/a UNICODE_JOIN_METHODDEF 13885 n/a UNICODE_CAPITALIZE_METHODDEF 13886 n/a UNICODE_CASEFOLD_METHODDEF 13887 n/a UNICODE_TITLE_METHODDEF 13888 n/a UNICODE_CENTER_METHODDEF 13889 n/a {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13890 n/a UNICODE_EXPANDTABS_METHODDEF 13891 n/a {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13892 n/a UNICODE_PARTITION_METHODDEF 13893 n/a {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13894 n/a UNICODE_LJUST_METHODDEF 13895 
n/a UNICODE_LOWER_METHODDEF 13896 n/a UNICODE_LSTRIP_METHODDEF 13897 n/a {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13898 n/a {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13899 n/a UNICODE_RJUST_METHODDEF 13900 n/a UNICODE_RSTRIP_METHODDEF 13901 n/a UNICODE_RPARTITION_METHODDEF 13902 n/a UNICODE_SPLITLINES_METHODDEF 13903 n/a UNICODE_STRIP_METHODDEF 13904 n/a UNICODE_SWAPCASE_METHODDEF 13905 n/a UNICODE_TRANSLATE_METHODDEF 13906 n/a UNICODE_UPPER_METHODDEF 13907 n/a {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13908 n/a {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13909 n/a UNICODE_ISLOWER_METHODDEF 13910 n/a UNICODE_ISUPPER_METHODDEF 13911 n/a UNICODE_ISTITLE_METHODDEF 13912 n/a UNICODE_ISSPACE_METHODDEF 13913 n/a UNICODE_ISDECIMAL_METHODDEF 13914 n/a UNICODE_ISDIGIT_METHODDEF 13915 n/a UNICODE_ISNUMERIC_METHODDEF 13916 n/a UNICODE_ISALPHA_METHODDEF 13917 n/a UNICODE_ISALNUM_METHODDEF 13918 n/a UNICODE_ISIDENTIFIER_METHODDEF 13919 n/a UNICODE_ISPRINTABLE_METHODDEF 13920 n/a UNICODE_ZFILL_METHODDEF 13921 n/a {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13922 n/a {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13923 n/a UNICODE___FORMAT___METHODDEF 13924 n/a UNICODE_MAKETRANS_METHODDEF 13925 n/a UNICODE_SIZEOF_METHODDEF 13926 n/a #if 0 13927 n/a /* These methods are just used for debugging the implementation. 
*/ 13928 n/a {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13929 n/a #endif 13930 n/a 13931 n/a {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13932 n/a {NULL, NULL} 13933 n/a }; 13934 n/a 13935 n/a static PyObject * 13936 n/a unicode_mod(PyObject *v, PyObject *w) 13937 n/a { 13938 n/a if (!PyUnicode_Check(v)) 13939 n/a Py_RETURN_NOTIMPLEMENTED; 13940 n/a return PyUnicode_Format(v, w); 13941 n/a } 13942 n/a 13943 n/a static PyNumberMethods unicode_as_number = { 13944 n/a 0, /*nb_add*/ 13945 n/a 0, /*nb_subtract*/ 13946 n/a 0, /*nb_multiply*/ 13947 n/a unicode_mod, /*nb_remainder*/ 13948 n/a }; 13949 n/a 13950 n/a static PySequenceMethods unicode_as_sequence = { 13951 n/a (lenfunc) unicode_length, /* sq_length */ 13952 n/a PyUnicode_Concat, /* sq_concat */ 13953 n/a (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13954 n/a (ssizeargfunc) unicode_getitem, /* sq_item */ 13955 n/a 0, /* sq_slice */ 13956 n/a 0, /* sq_ass_item */ 13957 n/a 0, /* sq_ass_slice */ 13958 n/a PyUnicode_Contains, /* sq_contains */ 13959 n/a }; 13960 n/a 13961 n/a static PyObject* 13962 n/a unicode_subscript(PyObject* self, PyObject* item) 13963 n/a { 13964 n/a if (PyUnicode_READY(self) == -1) 13965 n/a return NULL; 13966 n/a 13967 n/a if (PyIndex_Check(item)) { 13968 n/a Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13969 n/a if (i == -1 && PyErr_Occurred()) 13970 n/a return NULL; 13971 n/a if (i < 0) 13972 n/a i += PyUnicode_GET_LENGTH(self); 13973 n/a return unicode_getitem(self, i); 13974 n/a } else if (PySlice_Check(item)) { 13975 n/a Py_ssize_t start, stop, step, slicelength, cur, i; 13976 n/a PyObject *result; 13977 n/a void *src_data, *dest_data; 13978 n/a int src_kind, dest_kind; 13979 n/a Py_UCS4 ch, max_char, kind_limit; 13980 n/a 13981 n/a if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13982 n/a &start, &stop, &step, &slicelength) < 0) { 13983 n/a return NULL; 13984 n/a } 13985 n/a 13986 n/a if (slicelength <= 0) { 
13987 n/a _Py_RETURN_UNICODE_EMPTY(); 13988 n/a } else if (start == 0 && step == 1 && 13989 n/a slicelength == PyUnicode_GET_LENGTH(self)) { 13990 n/a return unicode_result_unchanged(self); 13991 n/a } else if (step == 1) { 13992 n/a return PyUnicode_Substring(self, 13993 n/a start, start + slicelength); 13994 n/a } 13995 n/a /* General case */ 13996 n/a src_kind = PyUnicode_KIND(self); 13997 n/a src_data = PyUnicode_DATA(self); 13998 n/a if (!PyUnicode_IS_ASCII(self)) { 13999 n/a kind_limit = kind_maxchar_limit(src_kind); 14000 n/a max_char = 0; 14001 n/a for (cur = start, i = 0; i < slicelength; cur += step, i++) { 14002 n/a ch = PyUnicode_READ(src_kind, src_data, cur); 14003 n/a if (ch > max_char) { 14004 n/a max_char = ch; 14005 n/a if (max_char >= kind_limit) 14006 n/a break; 14007 n/a } 14008 n/a } 14009 n/a } 14010 n/a else 14011 n/a max_char = 127; 14012 n/a result = PyUnicode_New(slicelength, max_char); 14013 n/a if (result == NULL) 14014 n/a return NULL; 14015 n/a dest_kind = PyUnicode_KIND(result); 14016 n/a dest_data = PyUnicode_DATA(result); 14017 n/a 14018 n/a for (cur = start, i = 0; i < slicelength; cur += step, i++) { 14019 n/a Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 14020 n/a PyUnicode_WRITE(dest_kind, dest_data, i, ch); 14021 n/a } 14022 n/a assert(_PyUnicode_CheckConsistency(result, 1)); 14023 n/a return result; 14024 n/a } else { 14025 n/a PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 14026 n/a return NULL; 14027 n/a } 14028 n/a } 14029 n/a 14030 n/a static PyMappingMethods unicode_as_mapping = { 14031 n/a (lenfunc)unicode_length, /* mp_length */ 14032 n/a (binaryfunc)unicode_subscript, /* mp_subscript */ 14033 n/a (objobjargproc)0, /* mp_ass_subscript */ 14034 n/a }; 14035 n/a 14036 n/a 14037 n/a /* Helpers for PyUnicode_Format() */ 14038 n/a 14039 n/a struct unicode_formatter_t { 14040 n/a PyObject *args; 14041 n/a int args_owned; 14042 n/a Py_ssize_t arglen, argidx; 14043 n/a PyObject *dict; 14044 n/a 
14045 n/a enum PyUnicode_Kind fmtkind; 14046 n/a Py_ssize_t fmtcnt, fmtpos; 14047 n/a void *fmtdata; 14048 n/a PyObject *fmtstr; 14049 n/a 14050 n/a _PyUnicodeWriter writer; 14051 n/a }; 14052 n/a 14053 n/a struct unicode_format_arg_t { 14054 n/a Py_UCS4 ch; 14055 n/a int flags; 14056 n/a Py_ssize_t width; 14057 n/a int prec; 14058 n/a int sign; 14059 n/a }; 14060 n/a 14061 n/a static PyObject * 14062 n/a unicode_format_getnextarg(struct unicode_formatter_t *ctx) 14063 n/a { 14064 n/a Py_ssize_t argidx = ctx->argidx; 14065 n/a 14066 n/a if (argidx < ctx->arglen) { 14067 n/a ctx->argidx++; 14068 n/a if (ctx->arglen < 0) 14069 n/a return ctx->args; 14070 n/a else 14071 n/a return PyTuple_GetItem(ctx->args, argidx); 14072 n/a } 14073 n/a PyErr_SetString(PyExc_TypeError, 14074 n/a "not enough arguments for format string"); 14075 n/a return NULL; 14076 n/a } 14077 n/a 14078 n/a /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 14079 n/a 14080 n/a /* Format a float into the writer if the writer is not NULL, or into *p_output 14081 n/a otherwise. 14082 n/a 14083 n/a Return 0 on success, raise an exception and return -1 on error. 
*/ 14084 n/a static int 14085 n/a formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 14086 n/a PyObject **p_output, 14087 n/a _PyUnicodeWriter *writer) 14088 n/a { 14089 n/a char *p; 14090 n/a double x; 14091 n/a Py_ssize_t len; 14092 n/a int prec; 14093 n/a int dtoa_flags; 14094 n/a 14095 n/a x = PyFloat_AsDouble(v); 14096 n/a if (x == -1.0 && PyErr_Occurred()) 14097 n/a return -1; 14098 n/a 14099 n/a prec = arg->prec; 14100 n/a if (prec < 0) 14101 n/a prec = 6; 14102 n/a 14103 n/a if (arg->flags & F_ALT) 14104 n/a dtoa_flags = Py_DTSF_ALT; 14105 n/a else 14106 n/a dtoa_flags = 0; 14107 n/a p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 14108 n/a if (p == NULL) 14109 n/a return -1; 14110 n/a len = strlen(p); 14111 n/a if (writer) { 14112 n/a if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 14113 n/a PyMem_Free(p); 14114 n/a return -1; 14115 n/a } 14116 n/a } 14117 n/a else 14118 n/a *p_output = _PyUnicode_FromASCII(p, len); 14119 n/a PyMem_Free(p); 14120 n/a return 0; 14121 n/a } 14122 n/a 14123 n/a /* formatlong() emulates the format codes d, u, o, x and X, and 14124 n/a * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 14125 n/a * Python's regular ints. 14126 n/a * Return value: a new PyUnicodeObject*, or NULL if error. 14127 n/a * The output string is of the form 14128 n/a * "-"? ("0x" | "0X")? digit+ 14129 n/a * "0x"/"0X" are present only for x and X conversions, with F_ALT 14130 n/a * set in flags. The case of hex digits will be correct, 14131 n/a * There will be at least prec digits, zero-filled on the left if 14132 n/a * necessary to get that many. 
14133 n/a * val object to be converted 14134 n/a * flags bitmask of format flags; only F_ALT is looked at 14135 n/a * prec minimum number of digits; 0-fill on left if needed 14136 n/a * type a character in [duoxX]; u acts the same as d 14137 n/a * 14138 n/a * CAUTION: o, x and X conversions on regular ints can never 14139 n/a * produce a '-' sign, but can for Python's unbounded ints. 14140 n/a */ 14141 n/a PyObject * 14142 n/a _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 14143 n/a { 14144 n/a PyObject *result = NULL; 14145 n/a char *buf; 14146 n/a Py_ssize_t i; 14147 n/a int sign; /* 1 if '-', else 0 */ 14148 n/a int len; /* number of characters */ 14149 n/a Py_ssize_t llen; 14150 n/a int numdigits; /* len == numnondigits + numdigits */ 14151 n/a int numnondigits = 0; 14152 n/a 14153 n/a /* Avoid exceeding SSIZE_T_MAX */ 14154 n/a if (prec > INT_MAX-3) { 14155 n/a PyErr_SetString(PyExc_OverflowError, 14156 n/a "precision too large"); 14157 n/a return NULL; 14158 n/a } 14159 n/a 14160 n/a assert(PyLong_Check(val)); 14161 n/a 14162 n/a switch (type) { 14163 n/a default: 14164 n/a assert(!"'type' not in [diuoxX]"); 14165 n/a case 'd': 14166 n/a case 'i': 14167 n/a case 'u': 14168 n/a /* int and int subclasses should print numerically when a numeric */ 14169 n/a /* format code is used (see issue18780) */ 14170 n/a result = PyNumber_ToBase(val, 10); 14171 n/a break; 14172 n/a case 'o': 14173 n/a numnondigits = 2; 14174 n/a result = PyNumber_ToBase(val, 8); 14175 n/a break; 14176 n/a case 'x': 14177 n/a case 'X': 14178 n/a numnondigits = 2; 14179 n/a result = PyNumber_ToBase(val, 16); 14180 n/a break; 14181 n/a } 14182 n/a if (!result) 14183 n/a return NULL; 14184 n/a 14185 n/a assert(unicode_modifiable(result)); 14186 n/a assert(PyUnicode_IS_READY(result)); 14187 n/a assert(PyUnicode_IS_ASCII(result)); 14188 n/a 14189 n/a /* To modify the string in-place, there can only be one reference. 
*/ 14190 n/a if (Py_REFCNT(result) != 1) { 14191 n/a Py_DECREF(result); 14192 n/a PyErr_BadInternalCall(); 14193 n/a return NULL; 14194 n/a } 14195 n/a buf = PyUnicode_DATA(result); 14196 n/a llen = PyUnicode_GET_LENGTH(result); 14197 n/a if (llen > INT_MAX) { 14198 n/a Py_DECREF(result); 14199 n/a PyErr_SetString(PyExc_ValueError, 14200 n/a "string too large in _PyUnicode_FormatLong"); 14201 n/a return NULL; 14202 n/a } 14203 n/a len = (int)llen; 14204 n/a sign = buf[0] == '-'; 14205 n/a numnondigits += sign; 14206 n/a numdigits = len - numnondigits; 14207 n/a assert(numdigits > 0); 14208 n/a 14209 n/a /* Get rid of base marker unless F_ALT */ 14210 n/a if (((alt) == 0 && 14211 n/a (type == 'o' || type == 'x' || type == 'X'))) { 14212 n/a assert(buf[sign] == '0'); 14213 n/a assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14214 n/a buf[sign+1] == 'o'); 14215 n/a numnondigits -= 2; 14216 n/a buf += 2; 14217 n/a len -= 2; 14218 n/a if (sign) 14219 n/a buf[0] = '-'; 14220 n/a assert(len == numnondigits + numdigits); 14221 n/a assert(numdigits > 0); 14222 n/a } 14223 n/a 14224 n/a /* Fill with leading zeroes to meet minimum width. */ 14225 n/a if (prec > numdigits) { 14226 n/a PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14227 n/a numnondigits + prec); 14228 n/a char *b1; 14229 n/a if (!r1) { 14230 n/a Py_DECREF(result); 14231 n/a return NULL; 14232 n/a } 14233 n/a b1 = PyBytes_AS_STRING(r1); 14234 n/a for (i = 0; i < numnondigits; ++i) 14235 n/a *b1++ = *buf++; 14236 n/a for (i = 0; i < prec - numdigits; i++) 14237 n/a *b1++ = '0'; 14238 n/a for (i = 0; i < numdigits; i++) 14239 n/a *b1++ = *buf++; 14240 n/a *b1 = '\0'; 14241 n/a Py_DECREF(result); 14242 n/a result = r1; 14243 n/a buf = PyBytes_AS_STRING(result); 14244 n/a len = numnondigits + prec; 14245 n/a } 14246 n/a 14247 n/a /* Fix up case for hex conversions. */ 14248 n/a if (type == 'X') { 14249 n/a /* Need to convert all lower case letters to upper case. 
14250 n/a and need to convert 0x to 0X (and -0x to -0X). */ 14251 n/a for (i = 0; i < len; i++) 14252 n/a if (buf[i] >= 'a' && buf[i] <= 'x') 14253 n/a buf[i] -= 'a'-'A'; 14254 n/a } 14255 n/a if (!PyUnicode_Check(result) 14256 n/a || buf != PyUnicode_DATA(result)) { 14257 n/a PyObject *unicode; 14258 n/a unicode = _PyUnicode_FromASCII(buf, len); 14259 n/a Py_DECREF(result); 14260 n/a result = unicode; 14261 n/a } 14262 n/a else if (len != PyUnicode_GET_LENGTH(result)) { 14263 n/a if (PyUnicode_Resize(&result, len) < 0) 14264 n/a Py_CLEAR(result); 14265 n/a } 14266 n/a return result; 14267 n/a } 14268 n/a 14269 n/a /* Format an integer or a float as an integer. 14270 n/a * Return 1 if the number has been formatted into the writer, 14271 n/a * 0 if the number has been formatted into *p_output 14272 n/a * -1 and raise an exception on error */ 14273 n/a static int 14274 n/a mainformatlong(PyObject *v, 14275 n/a struct unicode_format_arg_t *arg, 14276 n/a PyObject **p_output, 14277 n/a _PyUnicodeWriter *writer) 14278 n/a { 14279 n/a PyObject *iobj, *res; 14280 n/a char type = (char)arg->ch; 14281 n/a 14282 n/a if (!PyNumber_Check(v)) 14283 n/a goto wrongtype; 14284 n/a 14285 n/a /* make sure number is a type of integer for o, x, and X */ 14286 n/a if (!PyLong_Check(v)) { 14287 n/a if (type == 'o' || type == 'x' || type == 'X') { 14288 n/a iobj = PyNumber_Index(v); 14289 n/a if (iobj == NULL) { 14290 n/a if (PyErr_ExceptionMatches(PyExc_TypeError)) 14291 n/a goto wrongtype; 14292 n/a return -1; 14293 n/a } 14294 n/a } 14295 n/a else { 14296 n/a iobj = PyNumber_Long(v); 14297 n/a if (iobj == NULL ) { 14298 n/a if (PyErr_ExceptionMatches(PyExc_TypeError)) 14299 n/a goto wrongtype; 14300 n/a return -1; 14301 n/a } 14302 n/a } 14303 n/a assert(PyLong_Check(iobj)); 14304 n/a } 14305 n/a else { 14306 n/a iobj = v; 14307 n/a Py_INCREF(iobj); 14308 n/a } 14309 n/a 14310 n/a if (PyLong_CheckExact(v) 14311 n/a && arg->width == -1 && arg->prec == -1 14312 n/a && !(arg->flags & 
(F_SIGN | F_BLANK)) 14313 n/a && type != 'X') 14314 n/a { 14315 n/a /* Fast path */ 14316 n/a int alternate = arg->flags & F_ALT; 14317 n/a int base; 14318 n/a 14319 n/a switch(type) 14320 n/a { 14321 n/a default: 14322 n/a assert(0 && "'type' not in [diuoxX]"); 14323 n/a case 'd': 14324 n/a case 'i': 14325 n/a case 'u': 14326 n/a base = 10; 14327 n/a break; 14328 n/a case 'o': 14329 n/a base = 8; 14330 n/a break; 14331 n/a case 'x': 14332 n/a case 'X': 14333 n/a base = 16; 14334 n/a break; 14335 n/a } 14336 n/a 14337 n/a if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14338 n/a Py_DECREF(iobj); 14339 n/a return -1; 14340 n/a } 14341 n/a Py_DECREF(iobj); 14342 n/a return 1; 14343 n/a } 14344 n/a 14345 n/a res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14346 n/a Py_DECREF(iobj); 14347 n/a if (res == NULL) 14348 n/a return -1; 14349 n/a *p_output = res; 14350 n/a return 0; 14351 n/a 14352 n/a wrongtype: 14353 n/a switch(type) 14354 n/a { 14355 n/a case 'o': 14356 n/a case 'x': 14357 n/a case 'X': 14358 n/a PyErr_Format(PyExc_TypeError, 14359 n/a "%%%c format: an integer is required, " 14360 n/a "not %.200s", 14361 n/a type, Py_TYPE(v)->tp_name); 14362 n/a break; 14363 n/a default: 14364 n/a PyErr_Format(PyExc_TypeError, 14365 n/a "%%%c format: a number is required, " 14366 n/a "not %.200s", 14367 n/a type, Py_TYPE(v)->tp_name); 14368 n/a break; 14369 n/a } 14370 n/a return -1; 14371 n/a } 14372 n/a 14373 n/a static Py_UCS4 14374 n/a formatchar(PyObject *v) 14375 n/a { 14376 n/a /* presume that the buffer is at least 3 characters long */ 14377 n/a if (PyUnicode_Check(v)) { 14378 n/a if (PyUnicode_GET_LENGTH(v) == 1) { 14379 n/a return PyUnicode_READ_CHAR(v, 0); 14380 n/a } 14381 n/a goto onError; 14382 n/a } 14383 n/a else { 14384 n/a PyObject *iobj; 14385 n/a long x; 14386 n/a /* make sure number is a type of integer */ 14387 n/a if (!PyLong_Check(v)) { 14388 n/a iobj = PyNumber_Index(v); 14389 n/a if (iobj == NULL) { 14390 n/a 
goto onError; 14391 n/a } 14392 n/a x = PyLong_AsLong(iobj); 14393 n/a Py_DECREF(iobj); 14394 n/a } 14395 n/a else { 14396 n/a x = PyLong_AsLong(v); 14397 n/a } 14398 n/a if (x == -1 && PyErr_Occurred()) 14399 n/a goto onError; 14400 n/a 14401 n/a if (x < 0 || x > MAX_UNICODE) { 14402 n/a PyErr_SetString(PyExc_OverflowError, 14403 n/a "%c arg not in range(0x110000)"); 14404 n/a return (Py_UCS4) -1; 14405 n/a } 14406 n/a 14407 n/a return (Py_UCS4) x; 14408 n/a } 14409 n/a 14410 n/a onError: 14411 n/a PyErr_SetString(PyExc_TypeError, 14412 n/a "%c requires int or char"); 14413 n/a return (Py_UCS4) -1; 14414 n/a } 14415 n/a 14416 n/a /* Parse options of an argument: flags, width, precision. 14417 n/a Handle also "%(name)" syntax. 14418 n/a 14419 n/a Return 0 if the argument has been formatted into arg->str. 14420 n/a Return 1 if the argument has been written into ctx->writer, 14421 n/a Raise an exception and return -1 on error. */ 14422 n/a static int 14423 n/a unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14424 n/a struct unicode_format_arg_t *arg) 14425 n/a { 14426 n/a #define FORMAT_READ(ctx) \ 14427 n/a PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14428 n/a 14429 n/a PyObject *v; 14430 n/a 14431 n/a if (arg->ch == '(') { 14432 n/a /* Get argument value from a dictionary. Example: "%(name)s". 
*/ 14433 n/a Py_ssize_t keystart; 14434 n/a Py_ssize_t keylen; 14435 n/a PyObject *key; 14436 n/a int pcount = 1; 14437 n/a 14438 n/a if (ctx->dict == NULL) { 14439 n/a PyErr_SetString(PyExc_TypeError, 14440 n/a "format requires a mapping"); 14441 n/a return -1; 14442 n/a } 14443 n/a ++ctx->fmtpos; 14444 n/a --ctx->fmtcnt; 14445 n/a keystart = ctx->fmtpos; 14446 n/a /* Skip over balanced parentheses */ 14447 n/a while (pcount > 0 && --ctx->fmtcnt >= 0) { 14448 n/a arg->ch = FORMAT_READ(ctx); 14449 n/a if (arg->ch == ')') 14450 n/a --pcount; 14451 n/a else if (arg->ch == '(') 14452 n/a ++pcount; 14453 n/a ctx->fmtpos++; 14454 n/a } 14455 n/a keylen = ctx->fmtpos - keystart - 1; 14456 n/a if (ctx->fmtcnt < 0 || pcount > 0) { 14457 n/a PyErr_SetString(PyExc_ValueError, 14458 n/a "incomplete format key"); 14459 n/a return -1; 14460 n/a } 14461 n/a key = PyUnicode_Substring(ctx->fmtstr, 14462 n/a keystart, keystart + keylen); 14463 n/a if (key == NULL) 14464 n/a return -1; 14465 n/a if (ctx->args_owned) { 14466 n/a ctx->args_owned = 0; 14467 n/a Py_DECREF(ctx->args); 14468 n/a } 14469 n/a ctx->args = PyObject_GetItem(ctx->dict, key); 14470 n/a Py_DECREF(key); 14471 n/a if (ctx->args == NULL) 14472 n/a return -1; 14473 n/a ctx->args_owned = 1; 14474 n/a ctx->arglen = -1; 14475 n/a ctx->argidx = -2; 14476 n/a } 14477 n/a 14478 n/a /* Parse flags. Example: "%+i" => flags=F_SIGN. */ 14479 n/a while (--ctx->fmtcnt >= 0) { 14480 n/a arg->ch = FORMAT_READ(ctx); 14481 n/a ctx->fmtpos++; 14482 n/a switch (arg->ch) { 14483 n/a case '-': arg->flags |= F_LJUST; continue; 14484 n/a case '+': arg->flags |= F_SIGN; continue; 14485 n/a case ' ': arg->flags |= F_BLANK; continue; 14486 n/a case '#': arg->flags |= F_ALT; continue; 14487 n/a case '0': arg->flags |= F_ZERO; continue; 14488 n/a } 14489 n/a break; 14490 n/a } 14491 n/a 14492 n/a /* Parse width. 
Example: "%10s" => width=10 */ 14493 n/a if (arg->ch == '*') { 14494 n/a v = unicode_format_getnextarg(ctx); 14495 n/a if (v == NULL) 14496 n/a return -1; 14497 n/a if (!PyLong_Check(v)) { 14498 n/a PyErr_SetString(PyExc_TypeError, 14499 n/a "* wants int"); 14500 n/a return -1; 14501 n/a } 14502 n/a arg->width = PyLong_AsSsize_t(v); 14503 n/a if (arg->width == -1 && PyErr_Occurred()) 14504 n/a return -1; 14505 n/a if (arg->width < 0) { 14506 n/a arg->flags |= F_LJUST; 14507 n/a arg->width = -arg->width; 14508 n/a } 14509 n/a if (--ctx->fmtcnt >= 0) { 14510 n/a arg->ch = FORMAT_READ(ctx); 14511 n/a ctx->fmtpos++; 14512 n/a } 14513 n/a } 14514 n/a else if (arg->ch >= '0' && arg->ch <= '9') { 14515 n/a arg->width = arg->ch - '0'; 14516 n/a while (--ctx->fmtcnt >= 0) { 14517 n/a arg->ch = FORMAT_READ(ctx); 14518 n/a ctx->fmtpos++; 14519 n/a if (arg->ch < '0' || arg->ch > '9') 14520 n/a break; 14521 n/a /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14522 n/a mixing signed and unsigned comparison. Since arg->ch is between 14523 n/a '0' and '9', casting to int is safe. */ 14524 n/a if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14525 n/a PyErr_SetString(PyExc_ValueError, 14526 n/a "width too big"); 14527 n/a return -1; 14528 n/a } 14529 n/a arg->width = arg->width*10 + (arg->ch - '0'); 14530 n/a } 14531 n/a } 14532 n/a 14533 n/a /* Parse precision. 
Example: "%.3f" => prec=3 */ 14534 n/a if (arg->ch == '.') { 14535 n/a arg->prec = 0; 14536 n/a if (--ctx->fmtcnt >= 0) { 14537 n/a arg->ch = FORMAT_READ(ctx); 14538 n/a ctx->fmtpos++; 14539 n/a } 14540 n/a if (arg->ch == '*') { 14541 n/a v = unicode_format_getnextarg(ctx); 14542 n/a if (v == NULL) 14543 n/a return -1; 14544 n/a if (!PyLong_Check(v)) { 14545 n/a PyErr_SetString(PyExc_TypeError, 14546 n/a "* wants int"); 14547 n/a return -1; 14548 n/a } 14549 n/a arg->prec = _PyLong_AsInt(v); 14550 n/a if (arg->prec == -1 && PyErr_Occurred()) 14551 n/a return -1; 14552 n/a if (arg->prec < 0) 14553 n/a arg->prec = 0; 14554 n/a if (--ctx->fmtcnt >= 0) { 14555 n/a arg->ch = FORMAT_READ(ctx); 14556 n/a ctx->fmtpos++; 14557 n/a } 14558 n/a } 14559 n/a else if (arg->ch >= '0' && arg->ch <= '9') { 14560 n/a arg->prec = arg->ch - '0'; 14561 n/a while (--ctx->fmtcnt >= 0) { 14562 n/a arg->ch = FORMAT_READ(ctx); 14563 n/a ctx->fmtpos++; 14564 n/a if (arg->ch < '0' || arg->ch > '9') 14565 n/a break; 14566 n/a if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14567 n/a PyErr_SetString(PyExc_ValueError, 14568 n/a "precision too big"); 14569 n/a return -1; 14570 n/a } 14571 n/a arg->prec = arg->prec*10 + (arg->ch - '0'); 14572 n/a } 14573 n/a } 14574 n/a } 14575 n/a 14576 n/a /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14577 n/a if (ctx->fmtcnt >= 0) { 14578 n/a if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14579 n/a if (--ctx->fmtcnt >= 0) { 14580 n/a arg->ch = FORMAT_READ(ctx); 14581 n/a ctx->fmtpos++; 14582 n/a } 14583 n/a } 14584 n/a } 14585 n/a if (ctx->fmtcnt < 0) { 14586 n/a PyErr_SetString(PyExc_ValueError, 14587 n/a "incomplete format"); 14588 n/a return -1; 14589 n/a } 14590 n/a return 0; 14591 n/a 14592 n/a #undef FORMAT_READ 14593 n/a } 14594 n/a 14595 n/a /* Format one argument. 
Supported conversion specifiers: 14596 n/a 14597 n/a - "s", "r", "a": any type 14598 n/a - "i", "d", "u": int or float 14599 n/a - "o", "x", "X": int 14600 n/a - "e", "E", "f", "F", "g", "G": float 14601 n/a - "c": int or str (1 character) 14602 n/a 14603 n/a When possible, the output is written directly into the Unicode writer 14604 n/a (ctx->writer). A string is created when padding is required. 14605 n/a 14606 n/a Return 0 if the argument has been formatted into *p_str, 14607 n/a 1 if the argument has been written into ctx->writer, 14608 n/a -1 on error. */ 14609 n/a static int 14610 n/a unicode_format_arg_format(struct unicode_formatter_t *ctx, 14611 n/a struct unicode_format_arg_t *arg, 14612 n/a PyObject **p_str) 14613 n/a { 14614 n/a PyObject *v; 14615 n/a _PyUnicodeWriter *writer = &ctx->writer; 14616 n/a 14617 n/a if (ctx->fmtcnt == 0) 14618 n/a ctx->writer.overallocate = 0; 14619 n/a 14620 n/a if (arg->ch == '%') { 14621 n/a if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 14622 n/a return -1; 14623 n/a return 1; 14624 n/a } 14625 n/a 14626 n/a v = unicode_format_getnextarg(ctx); 14627 n/a if (v == NULL) 14628 n/a return -1; 14629 n/a 14630 n/a 14631 n/a switch (arg->ch) { 14632 n/a case 's': 14633 n/a case 'r': 14634 n/a case 'a': 14635 n/a if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14636 n/a /* Fast path */ 14637 n/a if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14638 n/a return -1; 14639 n/a return 1; 14640 n/a } 14641 n/a 14642 n/a if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14643 n/a *p_str = v; 14644 n/a Py_INCREF(*p_str); 14645 n/a } 14646 n/a else { 14647 n/a if (arg->ch == 's') 14648 n/a *p_str = PyObject_Str(v); 14649 n/a else if (arg->ch == 'r') 14650 n/a *p_str = PyObject_Repr(v); 14651 n/a else 14652 n/a *p_str = PyObject_ASCII(v); 14653 n/a } 14654 n/a break; 14655 n/a 14656 n/a case 'i': 14657 n/a case 'd': 14658 n/a case 'u': 14659 n/a case 'o': 14660 n/a case 'x': 14661 n/a case 'X': 
14662 n/a { 14663 n/a int ret = mainformatlong(v, arg, p_str, writer); 14664 n/a if (ret != 0) 14665 n/a return ret; 14666 n/a arg->sign = 1; 14667 n/a break; 14668 n/a } 14669 n/a 14670 n/a case 'e': 14671 n/a case 'E': 14672 n/a case 'f': 14673 n/a case 'F': 14674 n/a case 'g': 14675 n/a case 'G': 14676 n/a if (arg->width == -1 && arg->prec == -1 14677 n/a && !(arg->flags & (F_SIGN | F_BLANK))) 14678 n/a { 14679 n/a /* Fast path */ 14680 n/a if (formatfloat(v, arg, NULL, writer) == -1) 14681 n/a return -1; 14682 n/a return 1; 14683 n/a } 14684 n/a 14685 n/a arg->sign = 1; 14686 n/a if (formatfloat(v, arg, p_str, NULL) == -1) 14687 n/a return -1; 14688 n/a break; 14689 n/a 14690 n/a case 'c': 14691 n/a { 14692 n/a Py_UCS4 ch = formatchar(v); 14693 n/a if (ch == (Py_UCS4) -1) 14694 n/a return -1; 14695 n/a if (arg->width == -1 && arg->prec == -1) { 14696 n/a /* Fast path */ 14697 n/a if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14698 n/a return -1; 14699 n/a return 1; 14700 n/a } 14701 n/a *p_str = PyUnicode_FromOrdinal(ch); 14702 n/a break; 14703 n/a } 14704 n/a 14705 n/a default: 14706 n/a PyErr_Format(PyExc_ValueError, 14707 n/a "unsupported format character '%c' (0x%x) " 14708 n/a "at index %zd", 14709 n/a (31<=arg->ch && arg->ch<=126) ? 
(char)arg->ch : '?', 14710 n/a (int)arg->ch, 14711 n/a ctx->fmtpos - 1); 14712 n/a return -1; 14713 n/a } 14714 n/a if (*p_str == NULL) 14715 n/a return -1; 14716 n/a assert (PyUnicode_Check(*p_str)); 14717 n/a return 0; 14718 n/a } 14719 n/a 14720 n/a static int 14721 n/a unicode_format_arg_output(struct unicode_formatter_t *ctx, 14722 n/a struct unicode_format_arg_t *arg, 14723 n/a PyObject *str) 14724 n/a { 14725 n/a Py_ssize_t len; 14726 n/a enum PyUnicode_Kind kind; 14727 n/a void *pbuf; 14728 n/a Py_ssize_t pindex; 14729 n/a Py_UCS4 signchar; 14730 n/a Py_ssize_t buflen; 14731 n/a Py_UCS4 maxchar; 14732 n/a Py_ssize_t sublen; 14733 n/a _PyUnicodeWriter *writer = &ctx->writer; 14734 n/a Py_UCS4 fill; 14735 n/a 14736 n/a fill = ' '; 14737 n/a if (arg->sign && arg->flags & F_ZERO) 14738 n/a fill = '0'; 14739 n/a 14740 n/a if (PyUnicode_READY(str) == -1) 14741 n/a return -1; 14742 n/a 14743 n/a len = PyUnicode_GET_LENGTH(str); 14744 n/a if ((arg->width == -1 || arg->width <= len) 14745 n/a && (arg->prec == -1 || arg->prec >= len) 14746 n/a && !(arg->flags & (F_SIGN | F_BLANK))) 14747 n/a { 14748 n/a /* Fast path */ 14749 n/a if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14750 n/a return -1; 14751 n/a return 0; 14752 n/a } 14753 n/a 14754 n/a /* Truncate the string for "s", "r" and "a" formats 14755 n/a if the precision is set */ 14756 n/a if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14757 n/a if (arg->prec >= 0 && len > arg->prec) 14758 n/a len = arg->prec; 14759 n/a } 14760 n/a 14761 n/a /* Adjust sign and width */ 14762 n/a kind = PyUnicode_KIND(str); 14763 n/a pbuf = PyUnicode_DATA(str); 14764 n/a pindex = 0; 14765 n/a signchar = '\0'; 14766 n/a if (arg->sign) { 14767 n/a Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14768 n/a if (ch == '-' || ch == '+') { 14769 n/a signchar = ch; 14770 n/a len--; 14771 n/a pindex++; 14772 n/a } 14773 n/a else if (arg->flags & F_SIGN) 14774 n/a signchar = '+'; 14775 n/a else if (arg->flags & F_BLANK) 14776 
n/a signchar = ' '; 14777 n/a else 14778 n/a arg->sign = 0; 14779 n/a } 14780 n/a if (arg->width < len) 14781 n/a arg->width = len; 14782 n/a 14783 n/a /* Prepare the writer */ 14784 n/a maxchar = writer->maxchar; 14785 n/a if (!(arg->flags & F_LJUST)) { 14786 n/a if (arg->sign) { 14787 n/a if ((arg->width-1) > len) 14788 n/a maxchar = Py_MAX(maxchar, fill); 14789 n/a } 14790 n/a else { 14791 n/a if (arg->width > len) 14792 n/a maxchar = Py_MAX(maxchar, fill); 14793 n/a } 14794 n/a } 14795 n/a if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14796 n/a Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14797 n/a maxchar = Py_MAX(maxchar, strmaxchar); 14798 n/a } 14799 n/a 14800 n/a buflen = arg->width; 14801 n/a if (arg->sign && len == arg->width) 14802 n/a buflen++; 14803 n/a if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14804 n/a return -1; 14805 n/a 14806 n/a /* Write the sign if needed */ 14807 n/a if (arg->sign) { 14808 n/a if (fill != ' ') { 14809 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14810 n/a writer->pos += 1; 14811 n/a } 14812 n/a if (arg->width > len) 14813 n/a arg->width--; 14814 n/a } 14815 n/a 14816 n/a /* Write the numeric prefix for "x", "X" and "o" formats 14817 n/a if the alternate form is used. 14818 n/a For example, write "0x" for the "%#x" format. 
*/ 14819 n/a if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14820 n/a assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14821 n/a assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14822 n/a if (fill != ' ') { 14823 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14824 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14825 n/a writer->pos += 2; 14826 n/a pindex += 2; 14827 n/a } 14828 n/a arg->width -= 2; 14829 n/a if (arg->width < 0) 14830 n/a arg->width = 0; 14831 n/a len -= 2; 14832 n/a } 14833 n/a 14834 n/a /* Pad left with the fill character if needed */ 14835 n/a if (arg->width > len && !(arg->flags & F_LJUST)) { 14836 n/a sublen = arg->width - len; 14837 n/a FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14838 n/a writer->pos += sublen; 14839 n/a arg->width = len; 14840 n/a } 14841 n/a 14842 n/a /* If padding with spaces: write sign if needed and/or numeric prefix if 14843 n/a the alternate form is used */ 14844 n/a if (fill == ' ') { 14845 n/a if (arg->sign) { 14846 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14847 n/a writer->pos += 1; 14848 n/a } 14849 n/a if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14850 n/a assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14851 n/a assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14852 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14853 n/a PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14854 n/a writer->pos += 2; 14855 n/a pindex += 2; 14856 n/a } 14857 n/a } 14858 n/a 14859 n/a /* Write characters */ 14860 n/a if (len) { 14861 n/a _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14862 n/a str, pindex, len); 14863 n/a writer->pos += len; 14864 n/a } 14865 n/a 14866 n/a /* Pad right with the fill character if needed */ 14867 n/a if (arg->width > len) { 14868 n/a sublen = arg->width - len; 
14869 n/a FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14870 n/a writer->pos += sublen; 14871 n/a } 14872 n/a return 0; 14873 n/a } 14874 n/a 14875 n/a /* Helper of PyUnicode_Format(): format one arg. 14876 n/a Return 0 on success, raise an exception and return -1 on error. */ 14877 n/a static int 14878 n/a unicode_format_arg(struct unicode_formatter_t *ctx) 14879 n/a { 14880 n/a struct unicode_format_arg_t arg; 14881 n/a PyObject *str; 14882 n/a int ret; 14883 n/a 14884 n/a arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14885 n/a arg.flags = 0; 14886 n/a arg.width = -1; 14887 n/a arg.prec = -1; 14888 n/a arg.sign = 0; 14889 n/a str = NULL; 14890 n/a 14891 n/a ret = unicode_format_arg_parse(ctx, &arg); 14892 n/a if (ret == -1) 14893 n/a return -1; 14894 n/a 14895 n/a ret = unicode_format_arg_format(ctx, &arg, &str); 14896 n/a if (ret == -1) 14897 n/a return -1; 14898 n/a 14899 n/a if (ret != 1) { 14900 n/a ret = unicode_format_arg_output(ctx, &arg, str); 14901 n/a Py_DECREF(str); 14902 n/a if (ret == -1) 14903 n/a return -1; 14904 n/a } 14905 n/a 14906 n/a if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14907 n/a PyErr_SetString(PyExc_TypeError, 14908 n/a "not all arguments converted during string formatting"); 14909 n/a return -1; 14910 n/a } 14911 n/a return 0; 14912 n/a } 14913 n/a 14914 n/a PyObject * 14915 n/a PyUnicode_Format(PyObject *format, PyObject *args) 14916 n/a { 14917 n/a struct unicode_formatter_t ctx; 14918 n/a 14919 n/a if (format == NULL || args == NULL) { 14920 n/a PyErr_BadInternalCall(); 14921 n/a return NULL; 14922 n/a } 14923 n/a 14924 n/a if (ensure_unicode(format) < 0) 14925 n/a return NULL; 14926 n/a 14927 n/a ctx.fmtstr = format; 14928 n/a ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14929 n/a ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14930 n/a ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14931 n/a ctx.fmtpos = 0; 14932 n/a 14933 n/a _PyUnicodeWriter_Init(&ctx.writer); 14934 n/a 
ctx.writer.min_length = ctx.fmtcnt + 100; 14935 n/a ctx.writer.overallocate = 1; 14936 n/a 14937 n/a if (PyTuple_Check(args)) { 14938 n/a ctx.arglen = PyTuple_Size(args); 14939 n/a ctx.argidx = 0; 14940 n/a } 14941 n/a else { 14942 n/a ctx.arglen = -1; 14943 n/a ctx.argidx = -2; 14944 n/a } 14945 n/a ctx.args_owned = 0; 14946 n/a if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 14947 n/a ctx.dict = args; 14948 n/a else 14949 n/a ctx.dict = NULL; 14950 n/a ctx.args = args; 14951 n/a 14952 n/a while (--ctx.fmtcnt >= 0) { 14953 n/a if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14954 n/a Py_ssize_t nonfmtpos; 14955 n/a 14956 n/a nonfmtpos = ctx.fmtpos++; 14957 n/a while (ctx.fmtcnt >= 0 && 14958 n/a PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14959 n/a ctx.fmtpos++; 14960 n/a ctx.fmtcnt--; 14961 n/a } 14962 n/a if (ctx.fmtcnt < 0) { 14963 n/a ctx.fmtpos--; 14964 n/a ctx.writer.overallocate = 0; 14965 n/a } 14966 n/a 14967 n/a if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14968 n/a nonfmtpos, ctx.fmtpos) < 0) 14969 n/a goto onError; 14970 n/a } 14971 n/a else { 14972 n/a ctx.fmtpos++; 14973 n/a if (unicode_format_arg(&ctx) == -1) 14974 n/a goto onError; 14975 n/a } 14976 n/a } 14977 n/a 14978 n/a if (ctx.argidx < ctx.arglen && !ctx.dict) { 14979 n/a PyErr_SetString(PyExc_TypeError, 14980 n/a "not all arguments converted during string formatting"); 14981 n/a goto onError; 14982 n/a } 14983 n/a 14984 n/a if (ctx.args_owned) { 14985 n/a Py_DECREF(ctx.args); 14986 n/a } 14987 n/a return _PyUnicodeWriter_Finish(&ctx.writer); 14988 n/a 14989 n/a onError: 14990 n/a _PyUnicodeWriter_Dealloc(&ctx.writer); 14991 n/a if (ctx.args_owned) { 14992 n/a Py_DECREF(ctx.args); 14993 n/a } 14994 n/a return NULL; 14995 n/a } 14996 n/a 14997 n/a static PyObject * 14998 n/a unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14999 n/a 15000 n/a static PyObject * 15001 n/a 
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 15002 n/a { 15003 n/a PyObject *x = NULL; 15004 n/a static char *kwlist[] = {"object", "encoding", "errors", 0}; 15005 n/a char *encoding = NULL; 15006 n/a char *errors = NULL; 15007 n/a 15008 n/a if (type != &PyUnicode_Type) 15009 n/a return unicode_subtype_new(type, args, kwds); 15010 n/a if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 15011 n/a kwlist, &x, &encoding, &errors)) 15012 n/a return NULL; 15013 n/a if (x == NULL) 15014 n/a _Py_RETURN_UNICODE_EMPTY(); 15015 n/a if (encoding == NULL && errors == NULL) 15016 n/a return PyObject_Str(x); 15017 n/a else 15018 n/a return PyUnicode_FromEncodedObject(x, encoding, errors); 15019 n/a } 15020 n/a 15021 n/a static PyObject * 15022 n/a unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 15023 n/a { 15024 n/a PyObject *unicode, *self; 15025 n/a Py_ssize_t length, char_size; 15026 n/a int share_wstr, share_utf8; 15027 n/a unsigned int kind; 15028 n/a void *data; 15029 n/a 15030 n/a assert(PyType_IsSubtype(type, &PyUnicode_Type)); 15031 n/a 15032 n/a unicode = unicode_new(&PyUnicode_Type, args, kwds); 15033 n/a if (unicode == NULL) 15034 n/a return NULL; 15035 n/a assert(_PyUnicode_CHECK(unicode)); 15036 n/a if (PyUnicode_READY(unicode) == -1) { 15037 n/a Py_DECREF(unicode); 15038 n/a return NULL; 15039 n/a } 15040 n/a 15041 n/a self = type->tp_alloc(type, 0); 15042 n/a if (self == NULL) { 15043 n/a Py_DECREF(unicode); 15044 n/a return NULL; 15045 n/a } 15046 n/a kind = PyUnicode_KIND(unicode); 15047 n/a length = PyUnicode_GET_LENGTH(unicode); 15048 n/a 15049 n/a _PyUnicode_LENGTH(self) = length; 15050 n/a #ifdef Py_DEBUG 15051 n/a _PyUnicode_HASH(self) = -1; 15052 n/a #else 15053 n/a _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 15054 n/a #endif 15055 n/a _PyUnicode_STATE(self).interned = 0; 15056 n/a _PyUnicode_STATE(self).kind = kind; 15057 n/a _PyUnicode_STATE(self).compact = 0; 15058 n/a _PyUnicode_STATE(self).ascii = 
_PyUnicode_STATE(unicode).ascii; 15059 n/a _PyUnicode_STATE(self).ready = 1; 15060 n/a _PyUnicode_WSTR(self) = NULL; 15061 n/a _PyUnicode_UTF8_LENGTH(self) = 0; 15062 n/a _PyUnicode_UTF8(self) = NULL; 15063 n/a _PyUnicode_WSTR_LENGTH(self) = 0; 15064 n/a _PyUnicode_DATA_ANY(self) = NULL; 15065 n/a 15066 n/a share_utf8 = 0; 15067 n/a share_wstr = 0; 15068 n/a if (kind == PyUnicode_1BYTE_KIND) { 15069 n/a char_size = 1; 15070 n/a if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 15071 n/a share_utf8 = 1; 15072 n/a } 15073 n/a else if (kind == PyUnicode_2BYTE_KIND) { 15074 n/a char_size = 2; 15075 n/a if (sizeof(wchar_t) == 2) 15076 n/a share_wstr = 1; 15077 n/a } 15078 n/a else { 15079 n/a assert(kind == PyUnicode_4BYTE_KIND); 15080 n/a char_size = 4; 15081 n/a if (sizeof(wchar_t) == 4) 15082 n/a share_wstr = 1; 15083 n/a } 15084 n/a 15085 n/a /* Ensure we won't overflow the length. */ 15086 n/a if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 15087 n/a PyErr_NoMemory(); 15088 n/a goto onError; 15089 n/a } 15090 n/a data = PyObject_MALLOC((length + 1) * char_size); 15091 n/a if (data == NULL) { 15092 n/a PyErr_NoMemory(); 15093 n/a goto onError; 15094 n/a } 15095 n/a 15096 n/a _PyUnicode_DATA_ANY(self) = data; 15097 n/a if (share_utf8) { 15098 n/a _PyUnicode_UTF8_LENGTH(self) = length; 15099 n/a _PyUnicode_UTF8(self) = data; 15100 n/a } 15101 n/a if (share_wstr) { 15102 n/a _PyUnicode_WSTR_LENGTH(self) = length; 15103 n/a _PyUnicode_WSTR(self) = (wchar_t *)data; 15104 n/a } 15105 n/a 15106 n/a memcpy(data, PyUnicode_DATA(unicode), 15107 n/a kind * (length + 1)); 15108 n/a assert(_PyUnicode_CheckConsistency(self, 1)); 15109 n/a #ifdef Py_DEBUG 15110 n/a _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 15111 n/a #endif 15112 n/a Py_DECREF(unicode); 15113 n/a return self; 15114 n/a 15115 n/a onError: 15116 n/a Py_DECREF(unicode); 15117 n/a Py_DECREF(self); 15118 n/a return NULL; 15119 n/a } 15120 n/a 15121 n/a PyDoc_STRVAR(unicode_doc, 15122 n/a "str(object='') -> str\n\ 
15123 n/a str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 15124 n/a \n\ 15125 n/a Create a new string object from the given object. If encoding or\n\ 15126 n/a errors is specified, then the object must expose a data buffer\n\ 15127 n/a that will be decoded using the given encoding and error handler.\n\ 15128 n/a Otherwise, returns the result of object.__str__() (if defined)\n\ 15129 n/a or repr(object).\n\ 15130 n/a encoding defaults to sys.getdefaultencoding().\n\ 15131 n/a errors defaults to 'strict'."); 15132 n/a 15133 n/a static PyObject *unicode_iter(PyObject *seq); 15134 n/a 15135 n/a PyTypeObject PyUnicode_Type = { 15136 n/a PyVarObject_HEAD_INIT(&PyType_Type, 0) 15137 n/a "str", /* tp_name */ 15138 n/a sizeof(PyUnicodeObject), /* tp_size */ 15139 n/a 0, /* tp_itemsize */ 15140 n/a /* Slots */ 15141 n/a (destructor)unicode_dealloc, /* tp_dealloc */ 15142 n/a 0, /* tp_print */ 15143 n/a 0, /* tp_getattr */ 15144 n/a 0, /* tp_setattr */ 15145 n/a 0, /* tp_reserved */ 15146 n/a unicode_repr, /* tp_repr */ 15147 n/a &unicode_as_number, /* tp_as_number */ 15148 n/a &unicode_as_sequence, /* tp_as_sequence */ 15149 n/a &unicode_as_mapping, /* tp_as_mapping */ 15150 n/a (hashfunc) unicode_hash, /* tp_hash*/ 15151 n/a 0, /* tp_call*/ 15152 n/a (reprfunc) unicode_str, /* tp_str */ 15153 n/a PyObject_GenericGetAttr, /* tp_getattro */ 15154 n/a 0, /* tp_setattro */ 15155 n/a 0, /* tp_as_buffer */ 15156 n/a Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 15157 n/a Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 15158 n/a unicode_doc, /* tp_doc */ 15159 n/a 0, /* tp_traverse */ 15160 n/a 0, /* tp_clear */ 15161 n/a PyUnicode_RichCompare, /* tp_richcompare */ 15162 n/a 0, /* tp_weaklistoffset */ 15163 n/a unicode_iter, /* tp_iter */ 15164 n/a 0, /* tp_iternext */ 15165 n/a unicode_methods, /* tp_methods */ 15166 n/a 0, /* tp_members */ 15167 n/a 0, /* tp_getset */ 15168 n/a &PyBaseObject_Type, /* tp_base */ 15169 n/a 0, /* tp_dict */ 15170 n/a 0, /* tp_descr_get */ 15171 n/a 
0, /* tp_descr_set */ 15172 n/a 0, /* tp_dictoffset */ 15173 n/a 0, /* tp_init */ 15174 n/a 0, /* tp_alloc */ 15175 n/a unicode_new, /* tp_new */ 15176 n/a PyObject_Del, /* tp_free */ 15177 n/a }; 15178 n/a 15179 n/a /* Initialize the Unicode implementation */ 15180 n/a 15181 n/a int _PyUnicode_Init(void) 15182 n/a { 15183 n/a /* XXX - move this array to unicodectype.c ? */ 15184 n/a Py_UCS2 linebreak[] = { 15185 n/a 0x000A, /* LINE FEED */ 15186 n/a 0x000D, /* CARRIAGE RETURN */ 15187 n/a 0x001C, /* FILE SEPARATOR */ 15188 n/a 0x001D, /* GROUP SEPARATOR */ 15189 n/a 0x001E, /* RECORD SEPARATOR */ 15190 n/a 0x0085, /* NEXT LINE */ 15191 n/a 0x2028, /* LINE SEPARATOR */ 15192 n/a 0x2029, /* PARAGRAPH SEPARATOR */ 15193 n/a }; 15194 n/a 15195 n/a /* Init the implementation */ 15196 n/a _Py_INCREF_UNICODE_EMPTY(); 15197 n/a if (!unicode_empty) 15198 n/a Py_FatalError("Can't create empty string"); 15199 n/a Py_DECREF(unicode_empty); 15200 n/a 15201 n/a if (PyType_Ready(&PyUnicode_Type) < 0) 15202 n/a Py_FatalError("Can't initialize 'unicode'"); 15203 n/a 15204 n/a /* initialize the linebreak bloom filter */ 15205 n/a bloom_linebreak = make_bloom_mask( 15206 n/a PyUnicode_2BYTE_KIND, linebreak, 15207 n/a Py_ARRAY_LENGTH(linebreak)); 15208 n/a 15209 n/a if (PyType_Ready(&EncodingMapType) < 0) 15210 n/a Py_FatalError("Can't initialize encoding map type"); 15211 n/a 15212 n/a if (PyType_Ready(&PyFieldNameIter_Type) < 0) 15213 n/a Py_FatalError("Can't initialize field name iterator type"); 15214 n/a 15215 n/a if (PyType_Ready(&PyFormatterIter_Type) < 0) 15216 n/a Py_FatalError("Can't initialize formatter iter type"); 15217 n/a 15218 n/a return 0; 15219 n/a } 15220 n/a 15221 n/a /* Finalize the Unicode implementation */ 15222 n/a 15223 n/a int 15224 n/a PyUnicode_ClearFreeList(void) 15225 n/a { 15226 n/a return 0; 15227 n/a } 15228 n/a 15229 n/a void 15230 n/a _PyUnicode_Fini(void) 15231 n/a { 15232 n/a int i; 15233 n/a 15234 n/a Py_CLEAR(unicode_empty); 15235 n/a 15236 n/a 
for (i = 0; i < 256; i++) 15237 n/a Py_CLEAR(unicode_latin1[i]); 15238 n/a _PyUnicode_ClearStaticStrings(); 15239 n/a (void)PyUnicode_ClearFreeList(); 15240 n/a } 15241 n/a 15242 n/a void 15243 n/a PyUnicode_InternInPlace(PyObject **p) 15244 n/a { 15245 n/a PyObject *s = *p; 15246 n/a PyObject *t; 15247 n/a #ifdef Py_DEBUG 15248 n/a assert(s != NULL); 15249 n/a assert(_PyUnicode_CHECK(s)); 15250 n/a #else 15251 n/a if (s == NULL || !PyUnicode_Check(s)) 15252 n/a return; 15253 n/a #endif 15254 n/a /* If it's a subclass, we don't really know what putting 15255 n/a it in the interned dict might do. */ 15256 n/a if (!PyUnicode_CheckExact(s)) 15257 n/a return; 15258 n/a if (PyUnicode_CHECK_INTERNED(s)) 15259 n/a return; 15260 n/a if (interned == NULL) { 15261 n/a interned = PyDict_New(); 15262 n/a if (interned == NULL) { 15263 n/a PyErr_Clear(); /* Don't leave an exception */ 15264 n/a return; 15265 n/a } 15266 n/a } 15267 n/a Py_ALLOW_RECURSION 15268 n/a t = PyDict_SetDefault(interned, s, s); 15269 n/a Py_END_ALLOW_RECURSION 15270 n/a if (t == NULL) { 15271 n/a PyErr_Clear(); 15272 n/a return; 15273 n/a } 15274 n/a if (t != s) { 15275 n/a Py_INCREF(t); 15276 n/a Py_SETREF(*p, t); 15277 n/a return; 15278 n/a } 15279 n/a /* The two references in interned are not counted by refcnt. 
15280 n/a The deallocator will take care of this */ 15281 n/a Py_REFCNT(s) -= 2; 15282 n/a _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15283 n/a } 15284 n/a 15285 n/a void 15286 n/a PyUnicode_InternImmortal(PyObject **p) 15287 n/a { 15288 n/a PyUnicode_InternInPlace(p); 15289 n/a if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15290 n/a _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15291 n/a Py_INCREF(*p); 15292 n/a } 15293 n/a } 15294 n/a 15295 n/a PyObject * 15296 n/a PyUnicode_InternFromString(const char *cp) 15297 n/a { 15298 n/a PyObject *s = PyUnicode_FromString(cp); 15299 n/a if (s == NULL) 15300 n/a return NULL; 15301 n/a PyUnicode_InternInPlace(&s); 15302 n/a return s; 15303 n/a } 15304 n/a 15305 n/a void 15306 n/a _Py_ReleaseInternedUnicodeStrings(void) 15307 n/a { 15308 n/a PyObject *keys; 15309 n/a PyObject *s; 15310 n/a Py_ssize_t i, n; 15311 n/a Py_ssize_t immortal_size = 0, mortal_size = 0; 15312 n/a 15313 n/a if (interned == NULL || !PyDict_Check(interned)) 15314 n/a return; 15315 n/a keys = PyDict_Keys(interned); 15316 n/a if (keys == NULL || !PyList_Check(keys)) { 15317 n/a PyErr_Clear(); 15318 n/a return; 15319 n/a } 15320 n/a 15321 n/a /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15322 n/a detector, interned unicode strings are not forcibly deallocated; 15323 n/a rather, we give them their stolen references back, and then clear 15324 n/a and DECREF the interned dict. 
*/ 15325 n/a 15326 n/a n = PyList_GET_SIZE(keys); 15327 n/a fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15328 n/a n); 15329 n/a for (i = 0; i < n; i++) { 15330 n/a s = PyList_GET_ITEM(keys, i); 15331 n/a if (PyUnicode_READY(s) == -1) { 15332 n/a assert(0 && "could not ready string"); 15333 n/a fprintf(stderr, "could not ready string\n"); 15334 n/a } 15335 n/a switch (PyUnicode_CHECK_INTERNED(s)) { 15336 n/a case SSTATE_NOT_INTERNED: 15337 n/a /* XXX Shouldn't happen */ 15338 n/a break; 15339 n/a case SSTATE_INTERNED_IMMORTAL: 15340 n/a Py_REFCNT(s) += 1; 15341 n/a immortal_size += PyUnicode_GET_LENGTH(s); 15342 n/a break; 15343 n/a case SSTATE_INTERNED_MORTAL: 15344 n/a Py_REFCNT(s) += 2; 15345 n/a mortal_size += PyUnicode_GET_LENGTH(s); 15346 n/a break; 15347 n/a default: 15348 n/a Py_FatalError("Inconsistent interned string state."); 15349 n/a } 15350 n/a _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15351 n/a } 15352 n/a fprintf(stderr, "total size of all interned strings: " 15353 n/a "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15354 n/a "mortal/immortal\n", mortal_size, immortal_size); 15355 n/a Py_DECREF(keys); 15356 n/a PyDict_Clear(interned); 15357 n/a Py_CLEAR(interned); 15358 n/a } 15359 n/a 15360 n/a 15361 n/a /********************* Unicode Iterator **************************/ 15362 n/a 15363 n/a typedef struct { 15364 n/a PyObject_HEAD 15365 n/a Py_ssize_t it_index; 15366 n/a PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15367 n/a } unicodeiterobject; 15368 n/a 15369 n/a static void 15370 n/a unicodeiter_dealloc(unicodeiterobject *it) 15371 n/a { 15372 n/a _PyObject_GC_UNTRACK(it); 15373 n/a Py_XDECREF(it->it_seq); 15374 n/a PyObject_GC_Del(it); 15375 n/a } 15376 n/a 15377 n/a static int 15378 n/a unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15379 n/a { 15380 n/a Py_VISIT(it->it_seq); 15381 n/a return 0; 15382 n/a } 15383 n/a 15384 n/a static PyObject * 15385 n/a 
unicodeiter_next(unicodeiterobject *it) 15386 n/a { 15387 n/a PyObject *seq, *item; 15388 n/a 15389 n/a assert(it != NULL); 15390 n/a seq = it->it_seq; 15391 n/a if (seq == NULL) 15392 n/a return NULL; 15393 n/a assert(_PyUnicode_CHECK(seq)); 15394 n/a 15395 n/a if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15396 n/a int kind = PyUnicode_KIND(seq); 15397 n/a void *data = PyUnicode_DATA(seq); 15398 n/a Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15399 n/a item = PyUnicode_FromOrdinal(chr); 15400 n/a if (item != NULL) 15401 n/a ++it->it_index; 15402 n/a return item; 15403 n/a } 15404 n/a 15405 n/a it->it_seq = NULL; 15406 n/a Py_DECREF(seq); 15407 n/a return NULL; 15408 n/a } 15409 n/a 15410 n/a static PyObject * 15411 n/a unicodeiter_len(unicodeiterobject *it) 15412 n/a { 15413 n/a Py_ssize_t len = 0; 15414 n/a if (it->it_seq) 15415 n/a len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15416 n/a return PyLong_FromSsize_t(len); 15417 n/a } 15418 n/a 15419 n/a PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15420 n/a 15421 n/a static PyObject * 15422 n/a unicodeiter_reduce(unicodeiterobject *it) 15423 n/a { 15424 n/a if (it->it_seq != NULL) { 15425 n/a return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15426 n/a it->it_seq, it->it_index); 15427 n/a } else { 15428 n/a PyObject *u = (PyObject *)_PyUnicode_New(0); 15429 n/a if (u == NULL) 15430 n/a return NULL; 15431 n/a return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15432 n/a } 15433 n/a } 15434 n/a 15435 n/a PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15436 n/a 15437 n/a static PyObject * 15438 n/a unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15439 n/a { 15440 n/a Py_ssize_t index = PyLong_AsSsize_t(state); 15441 n/a if (index == -1 && PyErr_Occurred()) 15442 n/a return NULL; 15443 n/a if (it->it_seq != NULL) { 15444 n/a if (index < 0) 15445 n/a index = 0; 15446 n/a else if (index > 
PyUnicode_GET_LENGTH(it->it_seq)) 15447 n/a index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15448 n/a it->it_index = index; 15449 n/a } 15450 n/a Py_RETURN_NONE; 15451 n/a } 15452 n/a 15453 n/a PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15454 n/a 15455 n/a static PyMethodDef unicodeiter_methods[] = { 15456 n/a {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15457 n/a length_hint_doc}, 15458 n/a {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15459 n/a reduce_doc}, 15460 n/a {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15461 n/a setstate_doc}, 15462 n/a {NULL, NULL} /* sentinel */ 15463 n/a }; 15464 n/a 15465 n/a PyTypeObject PyUnicodeIter_Type = { 15466 n/a PyVarObject_HEAD_INIT(&PyType_Type, 0) 15467 n/a "str_iterator", /* tp_name */ 15468 n/a sizeof(unicodeiterobject), /* tp_basicsize */ 15469 n/a 0, /* tp_itemsize */ 15470 n/a /* methods */ 15471 n/a (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15472 n/a 0, /* tp_print */ 15473 n/a 0, /* tp_getattr */ 15474 n/a 0, /* tp_setattr */ 15475 n/a 0, /* tp_reserved */ 15476 n/a 0, /* tp_repr */ 15477 n/a 0, /* tp_as_number */ 15478 n/a 0, /* tp_as_sequence */ 15479 n/a 0, /* tp_as_mapping */ 15480 n/a 0, /* tp_hash */ 15481 n/a 0, /* tp_call */ 15482 n/a 0, /* tp_str */ 15483 n/a PyObject_GenericGetAttr, /* tp_getattro */ 15484 n/a 0, /* tp_setattro */ 15485 n/a 0, /* tp_as_buffer */ 15486 n/a Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15487 n/a 0, /* tp_doc */ 15488 n/a (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15489 n/a 0, /* tp_clear */ 15490 n/a 0, /* tp_richcompare */ 15491 n/a 0, /* tp_weaklistoffset */ 15492 n/a PyObject_SelfIter, /* tp_iter */ 15493 n/a (iternextfunc)unicodeiter_next, /* tp_iternext */ 15494 n/a unicodeiter_methods, /* tp_methods */ 15495 n/a 0, 15496 n/a }; 15497 n/a 15498 n/a static PyObject * 15499 n/a unicode_iter(PyObject *seq) 15500 n/a { 15501 n/a unicodeiterobject *it; 
15502 n/a 15503 n/a if (!PyUnicode_Check(seq)) { 15504 n/a PyErr_BadInternalCall(); 15505 n/a return NULL; 15506 n/a } 15507 n/a if (PyUnicode_READY(seq) == -1) 15508 n/a return NULL; 15509 n/a it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15510 n/a if (it == NULL) 15511 n/a return NULL; 15512 n/a it->it_index = 0; 15513 n/a Py_INCREF(seq); 15514 n/a it->it_seq = seq; 15515 n/a _PyObject_GC_TRACK(it); 15516 n/a return (PyObject *)it; 15517 n/a } 15518 n/a 15519 n/a 15520 n/a size_t 15521 n/a Py_UNICODE_strlen(const Py_UNICODE *u) 15522 n/a { 15523 n/a return wcslen(u); 15524 n/a } 15525 n/a 15526 n/a Py_UNICODE* 15527 n/a Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15528 n/a { 15529 n/a Py_UNICODE *u = s1; 15530 n/a while ((*u++ = *s2++)); 15531 n/a return s1; 15532 n/a } 15533 n/a 15534 n/a Py_UNICODE* 15535 n/a Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15536 n/a { 15537 n/a Py_UNICODE *u = s1; 15538 n/a while ((*u++ = *s2++)) 15539 n/a if (n-- == 0) 15540 n/a break; 15541 n/a return s1; 15542 n/a } 15543 n/a 15544 n/a Py_UNICODE* 15545 n/a Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15546 n/a { 15547 n/a Py_UNICODE *u1 = s1; 15548 n/a u1 += wcslen(u1); 15549 n/a while ((*u1++ = *s2++)); 15550 n/a return s1; 15551 n/a } 15552 n/a 15553 n/a int 15554 n/a Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15555 n/a { 15556 n/a while (*s1 && *s2 && *s1 == *s2) 15557 n/a s1++, s2++; 15558 n/a if (*s1 && *s2) 15559 n/a return (*s1 < *s2) ? -1 : +1; 15560 n/a if (*s1) 15561 n/a return 1; 15562 n/a if (*s2) 15563 n/a return -1; 15564 n/a return 0; 15565 n/a } 15566 n/a 15567 n/a int 15568 n/a Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15569 n/a { 15570 n/a Py_UNICODE u1, u2; 15571 n/a for (; n != 0; n--) { 15572 n/a u1 = *s1; 15573 n/a u2 = *s2; 15574 n/a if (u1 != u2) 15575 n/a return (u1 < u2) ? 
-1 : +1; 15576 n/a if (u1 == '\0') 15577 n/a return 0; 15578 n/a s1++; 15579 n/a s2++; 15580 n/a } 15581 n/a return 0; 15582 n/a } 15583 n/a 15584 n/a Py_UNICODE* 15585 n/a Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15586 n/a { 15587 n/a const Py_UNICODE *p; 15588 n/a for (p = s; *p; p++) 15589 n/a if (*p == c) 15590 n/a return (Py_UNICODE*)p; 15591 n/a return NULL; 15592 n/a } 15593 n/a 15594 n/a Py_UNICODE* 15595 n/a Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15596 n/a { 15597 n/a const Py_UNICODE *p; 15598 n/a p = s + wcslen(s); 15599 n/a while (p != s) { 15600 n/a p--; 15601 n/a if (*p == c) 15602 n/a return (Py_UNICODE*)p; 15603 n/a } 15604 n/a return NULL; 15605 n/a } 15606 n/a 15607 n/a Py_UNICODE* 15608 n/a PyUnicode_AsUnicodeCopy(PyObject *unicode) 15609 n/a { 15610 n/a Py_UNICODE *u, *copy; 15611 n/a Py_ssize_t len, size; 15612 n/a 15613 n/a if (!PyUnicode_Check(unicode)) { 15614 n/a PyErr_BadArgument(); 15615 n/a return NULL; 15616 n/a } 15617 n/a u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15618 n/a if (u == NULL) 15619 n/a return NULL; 15620 n/a /* Ensure we won't overflow the size. */ 15621 n/a if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15622 n/a PyErr_NoMemory(); 15623 n/a return NULL; 15624 n/a } 15625 n/a size = len + 1; /* copy the null character */ 15626 n/a size *= sizeof(Py_UNICODE); 15627 n/a copy = PyMem_Malloc(size); 15628 n/a if (copy == NULL) { 15629 n/a PyErr_NoMemory(); 15630 n/a return NULL; 15631 n/a } 15632 n/a memcpy(copy, u, size); 15633 n/a return copy; 15634 n/a } 15635 n/a 15636 n/a /* A _string module, to export formatter_parser and formatter_field_name_split 15637 n/a to the string.Formatter class implemented in Python. 
*/ 15638 n/a 15639 n/a static PyMethodDef _string_methods[] = { 15640 n/a {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15641 n/a METH_O, PyDoc_STR("split the argument as a field name")}, 15642 n/a {"formatter_parser", (PyCFunction) formatter_parser, 15643 n/a METH_O, PyDoc_STR("parse the argument as a format string")}, 15644 n/a {NULL, NULL} 15645 n/a }; 15646 n/a 15647 n/a static struct PyModuleDef _string_module = { 15648 n/a PyModuleDef_HEAD_INIT, 15649 n/a "_string", 15650 n/a PyDoc_STR("string helper module"), 15651 n/a 0, 15652 n/a _string_methods, 15653 n/a NULL, 15654 n/a NULL, 15655 n/a NULL, 15656 n/a NULL 15657 n/a }; 15658 n/a 15659 n/a PyMODINIT_FUNC 15660 n/a PyInit__string(void) 15661 n/a { 15662 n/a return PyModule_Create(&_string_module); 15663 n/a } 15664 n/a 15665 n/a 15666 n/a #ifdef __cplusplus 15667 n/a } 15668 n/a #endif
RetroSearch is an open source project built by @garambo
| Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4