Where Online Learning is simpler!
The C and C++ Include Header Files
/usr/include/python3.12/cpython/unicodeobject.h
$ cat -n /usr/include/python3.12/cpython/unicodeobject.h 1 #ifndef Py_CPYTHON_UNICODEOBJECT_H 2 # error "this header file must not be included directly" 3 #endif 4 5 /* Py_UNICODE was the native Unicode storage format (code unit) used by 6 Python and represents a single Unicode element in the Unicode type. 7 With PEP 393, Py_UNICODE is deprecated and replaced with a 8 typedef to wchar_t. */ 9 #define PY_UNICODE_TYPE wchar_t 10 /* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE; 11 12 /* --- Internal Unicode Operations ---------------------------------------- */ 13 14 // Static inline functions to work with surrogates 15 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) { 16 return (0xD800 <= ch && ch <= 0xDFFF); 17 } 18 static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) { 19 return (0xD800 <= ch && ch <= 0xDBFF); 20 } 21 static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) { 22 return (0xDC00 <= ch && ch <= 0xDFFF); 23 } 24 25 // Join two surrogate characters and return a single Py_UCS4 value. 26 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) { 27 assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); 28 assert(Py_UNICODE_IS_LOW_SURROGATE(low)); 29 return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF)); 30 } 31 32 // High surrogate = top 10 bits added to 0xD800. 33 // The character must be in the range [U+10000; U+10ffff]. 34 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) { 35 assert(0x10000 <= ch && ch <= 0x10ffff); 36 return (0xD800 - (0x10000 >> 10) + (ch >> 10)); 37 } 38 39 // Low surrogate = bottom 10 bits added to 0xDC00. 40 // The character must be in the range [U+10000; U+10ffff]. 41 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) { 42 assert(0x10000 <= ch && ch <= 0x10ffff); 43 return (0xDC00 + (ch & 0x3FF)); 44 } 45 46 /* --- Unicode Type ------------------------------------------------------- */ 47 48 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 49 structure. 
state.ascii and state.compact are set, and the data 50 immediately follow the structure. utf8_length can be found 51 in the length field; the utf8 pointer is equal to the data pointer. */ 52 typedef struct { 53 /* There are 4 forms of Unicode strings: 54 55 - compact ascii: 56 57 * structure = PyASCIIObject 58 * test: PyUnicode_IS_COMPACT_ASCII(op) 59 * kind = PyUnicode_1BYTE_KIND 60 * compact = 1 61 * ascii = 1 62 * (length is the length of the utf8) 63 * (data starts just after the structure) 64 * (since ASCII is decoded from UTF-8, the utf8 string are the data) 65 66 - compact: 67 68 * structure = PyCompactUnicodeObject 69 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) 70 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 71 PyUnicode_4BYTE_KIND 72 * compact = 1 73 * ascii = 0 74 * utf8 is not shared with data 75 * utf8_length = 0 if utf8 is NULL 76 * (data starts just after the structure) 77 78 - legacy string: 79 80 * structure = PyUnicodeObject structure 81 * test: !PyUnicode_IS_COMPACT(op) 82 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 83 PyUnicode_4BYTE_KIND 84 * compact = 0 85 * data.any is not NULL 86 * utf8 is shared and utf8_length = length with data.any if ascii = 1 87 * utf8_length = 0 if utf8 is NULL 88 89 Compact strings use only one memory block (structure + characters), 90 whereas legacy strings use one block for the structure and one block 91 for characters. 92 93 Legacy strings are created by subclasses of Unicode. 94 95 See also _PyUnicode_CheckConsistency(). 96 */ 97 PyObject_HEAD 98 Py_ssize_t length; /* Number of code points in the string */ 99 Py_hash_t hash; /* Hash value; -1 if not set */ 100 struct { 101 /* If interned is non-zero, the two references from the 102 dictionary to this object are *not* counted in ob_refcnt. 
103 The possible values here are: 104 0: Not Interned 105 1: Interned 106 2: Interned and Immortal 107 3: Interned, Immortal, and Static 108 This categorization allows the runtime to determine the right 109 cleanup mechanism at runtime shutdown. */ 110 unsigned int interned:2; 111 /* Character size: 112 113 - PyUnicode_1BYTE_KIND (1): 114 115 * character type = Py_UCS1 (8 bits, unsigned) 116 * all characters are in the range U+0000-U+00FF (latin1) 117 * if ascii is set, all characters are in the range U+0000-U+007F 118 (ASCII), otherwise at least one character is in the range 119 U+0080-U+00FF 120 121 - PyUnicode_2BYTE_KIND (2): 122 123 * character type = Py_UCS2 (16 bits, unsigned) 124 * all characters are in the range U+0000-U+FFFF (BMP) 125 * at least one character is in the range U+0100-U+FFFF 126 127 - PyUnicode_4BYTE_KIND (4): 128 129 * character type = Py_UCS4 (32 bits, unsigned) 130 * all characters are in the range U+0000-U+10FFFF 131 * at least one character is in the range U+10000-U+10FFFF 132 */ 133 unsigned int kind:3; 134 /* Compact is with respect to the allocation scheme. Compact unicode 135 objects only require one memory block while non-compact objects use 136 one block for the PyUnicodeObject struct and another for its data 137 buffer. */ 138 unsigned int compact:1; 139 /* The string only contains characters in the range U+0000-U+007F (ASCII) 140 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is 141 set, use the PyASCIIObject structure. */ 142 unsigned int ascii:1; 143 /* The object is statically allocated. */ 144 unsigned int statically_allocated:1; 145 /* Padding to ensure that PyUnicode_DATA() is always aligned to 146 4 bytes (see issue #19537 on m68k). */ 147 unsigned int :24; 148 } state; 149 } PyASCIIObject; 150 151 /* Non-ASCII strings allocated through PyUnicode_New use the 152 PyCompactUnicodeObject structure. state.compact is set, and the data 153 immediately follow the structure. 
*/ 154 typedef struct { 155 PyASCIIObject _base; 156 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 157 * terminating \0. */ 158 char *utf8; /* UTF-8 representation (null-terminated) */ 159 } PyCompactUnicodeObject; 160 161 /* Object format for Unicode subclasses. */ 162 typedef struct { 163 PyCompactUnicodeObject _base; 164 union { 165 void *any; 166 Py_UCS1 *latin1; 167 Py_UCS2 *ucs2; 168 Py_UCS4 *ucs4; 169 } data; /* Canonical, smallest-form Unicode buffer */ 170 } PyUnicodeObject; 171 172 PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 173 PyObject *op, 174 int check_content); 175 176 177 #define _PyASCIIObject_CAST(op) \ 178 (assert(PyUnicode_Check(op)), \ 179 _Py_CAST(PyASCIIObject*, (op))) 180 #define _PyCompactUnicodeObject_CAST(op) \ 181 (assert(PyUnicode_Check(op)), \ 182 _Py_CAST(PyCompactUnicodeObject*, (op))) 183 #define _PyUnicodeObject_CAST(op) \ 184 (assert(PyUnicode_Check(op)), \ 185 _Py_CAST(PyUnicodeObject*, (op))) 186 187 188 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 189 190 /* Values for PyASCIIObject.state: */ 191 192 /* Interning state. */ 193 #define SSTATE_NOT_INTERNED 0 194 #define SSTATE_INTERNED_MORTAL 1 195 #define SSTATE_INTERNED_IMMORTAL 2 196 #define SSTATE_INTERNED_IMMORTAL_STATIC 3 197 198 /* Use only if you know it's a string */ 199 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) { 200 return _PyASCIIObject_CAST(op)->state.interned; 201 } 202 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op)) 203 204 /* For backward compatibility */ 205 static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) { 206 return 1; 207 } 208 #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op)) 209 210 /* Return true if the string contains only ASCII characters, or 0 if not. The 211 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 212 ready. 
*/ 213 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) { 214 return _PyASCIIObject_CAST(op)->state.ascii; 215 } 216 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op)) 217 218 /* Return true if the string is compact or 0 if not. 219 No type checks or Ready calls are performed. */ 220 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) { 221 return _PyASCIIObject_CAST(op)->state.compact; 222 } 223 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op)) 224 225 /* Return true if the string is a compact ASCII string (use PyASCIIObject 226 structure), or 0 if not. No type checks or Ready calls are performed. */ 227 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) { 228 return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op)); 229 } 230 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op)) 231 232 enum PyUnicode_Kind { 233 /* Return values of the PyUnicode_KIND() function: */ 234 PyUnicode_1BYTE_KIND = 1, 235 PyUnicode_2BYTE_KIND = 2, 236 PyUnicode_4BYTE_KIND = 4 237 }; 238 239 // PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above. 240 // 241 // gh-89653: Converting this macro to a static inline function would introduce 242 // new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and 243 // unsigned numbers) where kind type is an int or on 244 // "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned). 245 #define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind) 246 247 /* Return a void pointer to the raw unicode buffer. 
*/ 248 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) { 249 if (PyUnicode_IS_ASCII(op)) { 250 return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1)); 251 } 252 return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1)); 253 } 254 255 static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) { 256 void *data; 257 assert(!PyUnicode_IS_COMPACT(op)); 258 data = _PyUnicodeObject_CAST(op)->data.any; 259 assert(data != NULL); 260 return data; 261 } 262 263 static inline void* PyUnicode_DATA(PyObject *op) { 264 if (PyUnicode_IS_COMPACT(op)) { 265 return _PyUnicode_COMPACT_DATA(op); 266 } 267 return _PyUnicode_NONCOMPACT_DATA(op); 268 } 269 #define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op)) 270 271 /* Return pointers to the canonical representation cast to unsigned char, 272 Py_UCS2, or Py_UCS4 for direct character access. 273 No checks are performed, use PyUnicode_KIND() before to ensure 274 these will work correctly. */ 275 276 #define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op)) 277 #define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op)) 278 #define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op)) 279 280 /* Returns the length of the unicode string. */ 281 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { 282 return _PyASCIIObject_CAST(op)->length; 283 } 284 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op)) 285 286 /* Write into the canonical representation, this function does not do any sanity 287 checks and is intended for usage in loops. The caller should cache the 288 kind and data pointers obtained from other function calls. 289 index is the index in the string (starts at 0) and value is the new 290 code point value which should be written to that location. 
*/ 291 static inline void PyUnicode_WRITE(int kind, void *data, 292 Py_ssize_t index, Py_UCS4 value) 293 { 294 assert(index >= 0); 295 if (kind == PyUnicode_1BYTE_KIND) { 296 assert(value <= 0xffU); 297 _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value); 298 } 299 else if (kind == PyUnicode_2BYTE_KIND) { 300 assert(value <= 0xffffU); 301 _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value); 302 } 303 else { 304 assert(kind == PyUnicode_4BYTE_KIND); 305 assert(value <= 0x10ffffU); 306 _Py_STATIC_CAST(Py_UCS4*, data)[index] = value; 307 } 308 } 309 #define PyUnicode_WRITE(kind, data, index, value) \ 310 PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \ 311 (index), _Py_STATIC_CAST(Py_UCS4, value)) 312 313 /* Read a code point from the string's canonical representation. No checks 314 or ready calls are performed. */ 315 static inline Py_UCS4 PyUnicode_READ(int kind, 316 const void *data, Py_ssize_t index) 317 { 318 assert(index >= 0); 319 if (kind == PyUnicode_1BYTE_KIND) { 320 return _Py_STATIC_CAST(const Py_UCS1*, data)[index]; 321 } 322 if (kind == PyUnicode_2BYTE_KIND) { 323 return _Py_STATIC_CAST(const Py_UCS2*, data)[index]; 324 } 325 assert(kind == PyUnicode_4BYTE_KIND); 326 return _Py_STATIC_CAST(const Py_UCS4*, data)[index]; 327 } 328 #define PyUnicode_READ(kind, data, index) \ 329 PyUnicode_READ(_Py_STATIC_CAST(int, kind), \ 330 _Py_STATIC_CAST(const void*, data), \ 331 (index)) 332 333 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 334 calls PyUnicode_KIND() and might call it twice. For single reads, use 335 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 336 cache kind and use PyUnicode_READ instead. 
*/ 337 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) 338 { 339 int kind; 340 341 assert(index >= 0); 342 // Tolerate reading the NUL character at str[len(str)] 343 assert(index <= PyUnicode_GET_LENGTH(unicode)); 344 345 kind = PyUnicode_KIND(unicode); 346 if (kind == PyUnicode_1BYTE_KIND) { 347 return PyUnicode_1BYTE_DATA(unicode)[index]; 348 } 349 if (kind == PyUnicode_2BYTE_KIND) { 350 return PyUnicode_2BYTE_DATA(unicode)[index]; 351 } 352 assert(kind == PyUnicode_4BYTE_KIND); 353 return PyUnicode_4BYTE_DATA(unicode)[index]; 354 } 355 #define PyUnicode_READ_CHAR(unicode, index) \ 356 PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index)) 357 358 /* Return a maximum character value which is suitable for creating another 359 string based on op. This is always an approximation but more efficient 360 than iterating over the string. */ 361 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op) 362 { 363 int kind; 364 365 if (PyUnicode_IS_ASCII(op)) { 366 return 0x7fU; 367 } 368 369 kind = PyUnicode_KIND(op); 370 if (kind == PyUnicode_1BYTE_KIND) { 371 return 0xffU; 372 } 373 if (kind == PyUnicode_2BYTE_KIND) { 374 return 0xffffU; 375 } 376 assert(kind == PyUnicode_4BYTE_KIND); 377 return 0x10ffffU; 378 } 379 #define PyUnicode_MAX_CHAR_VALUE(op) \ 380 PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op)) 381 382 /* === Public API ========================================================= */ 383 384 /* --- Plain Py_UNICODE --------------------------------------------------- */ 385 386 /* With PEP 393, this is the recommended way to allocate a new unicode object. 387 This function will allocate the object and its buffer in a single memory 388 block. Objects created using this function are not resizable. 
*/ 389 PyAPI_FUNC(PyObject*) PyUnicode_New( 390 Py_ssize_t size, /* Number of code points in the new string */ 391 Py_UCS4 maxchar /* maximum code point value in the string */ 392 ); 393 394 /* For backward compatibility */ 395 static inline int PyUnicode_READY(PyObject* Py_UNUSED(op)) 396 { 397 return 0; 398 } 399 #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op)) 400 401 /* Get a copy of a Unicode string. */ 402 PyAPI_FUNC(PyObject*) _PyUnicode_Copy( 403 PyObject *unicode 404 ); 405 406 /* Copy character from one unicode object into another, this function performs 407 character conversion when necessary and falls back to memcpy() if possible. 408 409 Fail if to is too small (smaller than *how_many* or smaller than 410 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 411 kind(to), or if *to* has more than 1 reference. 412 413 Return the number of written character, or return -1 and raise an exception 414 on error. 415 416 Pseudo-code: 417 418 how_many = min(how_many, len(from) - from_start) 419 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 420 return how_many 421 422 Note: The function doesn't write a terminating null character. 423 */ 424 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 425 PyObject *to, 426 Py_ssize_t to_start, 427 PyObject *from, 428 Py_ssize_t from_start, 429 Py_ssize_t how_many 430 ); 431 432 /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so 433 may crash if parameters are invalid (e.g. if the output string 434 is too short). */ 435 PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( 436 PyObject *to, 437 Py_ssize_t to_start, 438 PyObject *from, 439 Py_ssize_t from_start, 440 Py_ssize_t how_many 441 ); 442 443 /* Fill a string with a character: write fill_char into 444 unicode[start:start+length]. 445 446 Fail if fill_char is bigger than the string maximum character, or if the 447 string has more than 1 reference. 
448 449 Return the number of written character, or return -1 and raise an exception 450 on error. */ 451 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( 452 PyObject *unicode, 453 Py_ssize_t start, 454 Py_ssize_t length, 455 Py_UCS4 fill_char 456 ); 457 458 /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash 459 if parameters are invalid (e.g. if length is longer than the string). */ 460 PyAPI_FUNC(void) _PyUnicode_FastFill( 461 PyObject *unicode, 462 Py_ssize_t start, 463 Py_ssize_t length, 464 Py_UCS4 fill_char 465 ); 466 467 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 468 Scan the string to find the maximum character. */ 469 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 470 int kind, 471 const void *buffer, 472 Py_ssize_t size); 473 474 /* Create a new string from a buffer of ASCII characters. 475 WARNING: Don't check if the string contains any non-ASCII character. */ 476 PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( 477 const char *buffer, 478 Py_ssize_t size); 479 480 /* Compute the maximum character of the substring unicode[start:end]. 481 Return 127 for an empty string. */ 482 PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( 483 PyObject *unicode, 484 Py_ssize_t start, 485 Py_ssize_t end); 486 487 /* --- _PyUnicodeWriter API ----------------------------------------------- */ 488 489 typedef struct { 490 PyObject *buffer; 491 void *data; 492 int kind; 493 Py_UCS4 maxchar; 494 Py_ssize_t size; 495 Py_ssize_t pos; 496 497 /* minimum number of allocated characters (default: 0) */ 498 Py_ssize_t min_length; 499 500 /* minimum character (default: 127, ASCII) */ 501 Py_UCS4 min_char; 502 503 /* If non-zero, overallocate the buffer (default: 0). */ 504 unsigned char overallocate; 505 506 /* If readonly is 1, buffer is a shared string (cannot be modified) 507 and size is set to 0. */ 508 unsigned char readonly; 509 } _PyUnicodeWriter ; 510 511 /* Initialize a Unicode writer. 
512 * 513 * By default, the minimum buffer size is 0 character and overallocation is 514 * disabled. Set min_length, min_char and overallocate attributes to control 515 * the allocation of the buffer. */ 516 PyAPI_FUNC(void) 517 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer); 518 519 /* Prepare the buffer to write 'length' characters 520 with the specified maximum character. 521 522 Return 0 on success, raise an exception and return -1 on error. */ 523 #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ 524 (((MAXCHAR) <= (WRITER)->maxchar \ 525 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ 526 ? 0 \ 527 : (((LENGTH) == 0) \ 528 ? 0 \ 529 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) 530 531 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro 532 instead. */ 533 PyAPI_FUNC(int) 534 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 535 Py_ssize_t length, Py_UCS4 maxchar); 536 537 /* Prepare the buffer to have at least the kind KIND. 538 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will 539 support characters in range U+000-U+FFFF. 540 541 Return 0 on success, raise an exception and return -1 on error. */ 542 #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ 543 ((KIND) <= (WRITER)->kind \ 544 ? 0 \ 545 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) 546 547 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() 548 macro instead. */ 549 PyAPI_FUNC(int) 550 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 551 int kind); 552 553 /* Append a Unicode character. 554 Return 0 on success, raise an exception and return -1 on error. */ 555 PyAPI_FUNC(int) 556 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, 557 Py_UCS4 ch 558 ); 559 560 /* Append a Unicode string. 561 Return 0 on success, raise an exception and return -1 on error. 
*/ 562 PyAPI_FUNC(int) 563 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, 564 PyObject *str /* Unicode string */ 565 ); 566 567 /* Append a substring of a Unicode string. 568 Return 0 on success, raise an exception and return -1 on error. */ 569 PyAPI_FUNC(int) 570 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, 571 PyObject *str, /* Unicode string */ 572 Py_ssize_t start, 573 Py_ssize_t end 574 ); 575 576 /* Append an ASCII-encoded byte string. 577 Return 0 on success, raise an exception and return -1 on error. */ 578 PyAPI_FUNC(int) 579 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 580 const char *str, /* ASCII-encoded byte string */ 581 Py_ssize_t len /* number of bytes, or -1 if unknown */ 582 ); 583 584 /* Append a latin1-encoded byte string. 585 Return 0 on success, raise an exception and return -1 on error. */ 586 PyAPI_FUNC(int) 587 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 588 const char *str, /* latin1-encoded byte string */ 589 Py_ssize_t len /* length in bytes */ 590 ); 591 592 /* Get the value of the writer as a Unicode string. Clear the 593 buffer of the writer. Raise an exception and return NULL 594 on error. */ 595 PyAPI_FUNC(PyObject *) 596 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); 597 598 /* Deallocate memory of a writer (clear its internal buffer). */ 599 PyAPI_FUNC(void) 600 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); 601 602 603 /* Format the object based on the format_spec, as defined in PEP 3101 604 (Advanced String Formatting). */ 605 PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( 606 _PyUnicodeWriter *writer, 607 PyObject *obj, 608 PyObject *format_spec, 609 Py_ssize_t start, 610 Py_ssize_t end); 611 612 /* --- Manage the default encoding ---------------------------------------- */ 613 614 /* Returns a pointer to the default encoding (UTF-8) of the 615 Unicode object unicode. 
616 617 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 618 in the unicodeobject. 619 620 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 621 support the previous internal function with the same behaviour. 622 623 Use of this API is DEPRECATED since no size information can be 624 extracted from the returned data. 625 */ 626 627 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); 628 629 #define _PyUnicode_AsString PyUnicode_AsUTF8 630 631 /* --- UTF-7 Codecs ------------------------------------------------------- */ 632 633 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 634 PyObject *unicode, /* Unicode object */ 635 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 636 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 637 const char *errors /* error handling */ 638 ); 639 640 /* --- UTF-8 Codecs ------------------------------------------------------- */ 641 642 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 643 PyObject *unicode, 644 const char *errors); 645 646 /* --- UTF-32 Codecs ------------------------------------------------------ */ 647 648 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 649 PyObject *object, /* Unicode object */ 650 const char *errors, /* error handling */ 651 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 652 ); 653 654 /* --- UTF-16 Codecs ------------------------------------------------------ */ 655 656 /* Returns a Python string object holding the UTF-16 encoded value of 657 the Unicode data. 658 659 If byteorder is not 0, output is written according to the following 660 byte order: 661 662 byteorder == -1: little endian 663 byteorder == 0: native byte order (writes a BOM mark) 664 byteorder == 1: big endian 665 666 If byteorder is 0, the output string will always start with the 667 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 668 prepended. 
669 */ 670 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 671 PyObject* unicode, /* Unicode object */ 672 const char *errors, /* error handling */ 673 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 674 ); 675 676 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 677 678 /* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */ 679 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful( 680 const char *string, /* Unicode-Escape encoded string */ 681 Py_ssize_t length, /* size of string */ 682 const char *errors, /* error handling */ 683 Py_ssize_t *consumed /* bytes consumed */ 684 ); 685 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape 686 chars. */ 687 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal( 688 const char *string, /* Unicode-Escape encoded string */ 689 Py_ssize_t length, /* size of string */ 690 const char *errors, /* error handling */ 691 Py_ssize_t *consumed, /* bytes consumed */ 692 const char **first_invalid_escape /* on return, points to first 693 invalid escaped char in 694 string. */ 695 ); 696 697 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */ 698 699 /* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. 
*/ 700 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful( 701 const char *string, /* Unicode-Escape encoded string */ 702 Py_ssize_t length, /* size of string */ 703 const char *errors, /* error handling */ 704 Py_ssize_t *consumed /* bytes consumed */ 705 ); 706 707 /* --- Latin-1 Codecs ----------------------------------------------------- */ 708 709 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 710 PyObject* unicode, 711 const char* errors); 712 713 /* --- ASCII Codecs ------------------------------------------------------- */ 714 715 PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 716 PyObject* unicode, 717 const char* errors); 718 719 /* --- Character Map Codecs ----------------------------------------------- */ 720 721 /* Translate an Unicode object by applying a character mapping table to 722 it and return the resulting Unicode object. 723 724 The mapping table must map Unicode ordinal integers to Unicode strings, 725 Unicode ordinal integers or None (causing deletion of the character). 726 727 Mapping tables may be dictionaries or sequences. Unmapped character 728 ordinals (ones which cause a LookupError) are left untouched and 729 are copied as-is. 730 */ 731 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 732 PyObject *unicode, /* Unicode object */ 733 PyObject *mapping, /* encoding mapping */ 734 const char *errors /* error handling */ 735 ); 736 737 /* --- Decimal Encoder ---------------------------------------------------- */ 738 739 /* Coverts a Unicode object holding a decimal value to an ASCII string 740 for using in int, float and complex parsers. 741 Transforms code points that have decimal digit property to the 742 corresponding ASCII digit code points. Transforms spaces to ASCII. 743 Transforms code points starting from the first non-ASCII code point that 744 is neither a decimal digit nor a space to the end into '?'. 
*/ 745 746 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 747 PyObject *unicode /* Unicode object */ 748 ); 749 750 /* --- Methods & Slots ---------------------------------------------------- */ 751 752 PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray( 753 PyObject *separator, 754 PyObject *const *items, 755 Py_ssize_t seqlen 756 ); 757 758 /* Test whether a unicode is equal to ASCII identifier. Return 1 if true, 759 0 otherwise. The right argument must be ASCII identifier. 760 Any error occurs inside will be cleared before return. */ 761 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId( 762 PyObject *left, /* Left string */ 763 _Py_Identifier *right /* Right identifier */ 764 ); 765 766 /* Test whether a unicode is equal to ASCII string. Return 1 if true, 767 0 otherwise. The right argument must be ASCII-encoded string. 768 Any error occurs inside will be cleared before return. */ 769 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString( 770 PyObject *left, 771 const char *right /* ASCII-encoded string */ 772 ); 773 774 /* Externally visible for str.strip(unicode) */ 775 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 776 PyObject *self, 777 int striptype, 778 PyObject *sepobj 779 ); 780 781 /* Using explicit passed-in values, insert the thousands grouping 782 into the string pointed to by buffer. For the argument descriptions, 783 see Objects/stringlib/localeutil.h */ 784 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 785 _PyUnicodeWriter *writer, 786 Py_ssize_t n_buffer, 787 PyObject *digits, 788 Py_ssize_t d_pos, 789 Py_ssize_t n_digits, 790 Py_ssize_t min_width, 791 const char *grouping, 792 PyObject *thousands_sep, 793 Py_UCS4 *maxchar); 794 795 /* === Characters Type APIs =============================================== */ 796 797 /* These should not be used directly. Use the Py_UNICODE_IS* and 798 Py_UNICODE_TO* macros instead. 799 800 These APIs are implemented in Objects/unicodectype.c. 
801 802 */ 803 804 PyAPI_FUNC(int) _PyUnicode_IsLowercase( 805 Py_UCS4 ch /* Unicode character */ 806 ); 807 808 PyAPI_FUNC(int) _PyUnicode_IsUppercase( 809 Py_UCS4 ch /* Unicode character */ 810 ); 811 812 PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 813 Py_UCS4 ch /* Unicode character */ 814 ); 815 816 PyAPI_FUNC(int) _PyUnicode_IsXidStart( 817 Py_UCS4 ch /* Unicode character */ 818 ); 819 820 PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 821 Py_UCS4 ch /* Unicode character */ 822 ); 823 824 PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 825 const Py_UCS4 ch /* Unicode character */ 826 ); 827 828 PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 829 const Py_UCS4 ch /* Unicode character */ 830 ); 831 832 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 833 Py_UCS4 ch /* Unicode character */ 834 ); 835 836 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 837 Py_UCS4 ch /* Unicode character */ 838 ); 839 840 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 841 Py_UCS4 ch /* Unicode character */ 842 ); 843 844 PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 845 Py_UCS4 ch, /* Unicode character */ 846 Py_UCS4 *res 847 ); 848 849 PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 850 Py_UCS4 ch, /* Unicode character */ 851 Py_UCS4 *res 852 ); 853 854 PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 855 Py_UCS4 ch, /* Unicode character */ 856 Py_UCS4 *res 857 ); 858 859 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 860 Py_UCS4 ch, /* Unicode character */ 861 Py_UCS4 *res 862 ); 863 864 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 865 Py_UCS4 ch /* Unicode character */ 866 ); 867 868 PyAPI_FUNC(int) _PyUnicode_IsCased( 869 Py_UCS4 ch /* Unicode character */ 870 ); 871 872 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 873 Py_UCS4 ch /* Unicode character */ 874 ); 875 876 PyAPI_FUNC(int) _PyUnicode_ToDigit( 877 Py_UCS4 ch /* Unicode character */ 878 ); 879 880 PyAPI_FUNC(double) _PyUnicode_ToNumeric( 881 Py_UCS4 ch /* Unicode character */ 882 ); 883 884 PyAPI_FUNC(int) 
_PyUnicode_IsDecimalDigit( 885 Py_UCS4 ch /* Unicode character */ 886 ); 887 888 PyAPI_FUNC(int) _PyUnicode_IsDigit( 889 Py_UCS4 ch /* Unicode character */ 890 ); 891 892 PyAPI_FUNC(int) _PyUnicode_IsNumeric( 893 Py_UCS4 ch /* Unicode character */ 894 ); 895 896 PyAPI_FUNC(int) _PyUnicode_IsPrintable( 897 Py_UCS4 ch /* Unicode character */ 898 ); 899 900 PyAPI_FUNC(int) _PyUnicode_IsAlpha( 901 Py_UCS4 ch /* Unicode character */ 902 ); 903 904 // Helper array used by Py_UNICODE_ISSPACE(). 905 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 906 907 // Since splitting on whitespace is an important use case, and 908 // whitespace in most situations is solely ASCII whitespace, we 909 // optimize for the common case by using a quick look-up table 910 // _Py_ascii_whitespace (see below) with an inlined check. 911 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { 912 if (ch < 128) { 913 return _Py_ascii_whitespace[ch]; 914 } 915 return _PyUnicode_IsWhitespace(ch); 916 } 917 918 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 919 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 920 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 921 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 922 923 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 924 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 925 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 926 927 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 928 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 929 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 930 #define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 931 932 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 933 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 934 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 935 936 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 937 938 static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) 
{ 939 return (Py_UNICODE_ISALPHA(ch) 940 || Py_UNICODE_ISDECIMAL(ch) 941 || Py_UNICODE_ISDIGIT(ch) 942 || Py_UNICODE_ISNUMERIC(ch)); 943 } 944 945 946 /* === Misc functions ===================================================== */ 947 948 PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); 949 950 /* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ 951 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 952 953 /* Fast equality check when the inputs are known to be exact unicode types 954 and where the hash values are equal (i.e. a very probable match) */ 955 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); 956 957 /* Equality check. */ 958 PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *); 959 960 PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *); 961 PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *); 962 963 PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
Contact us
|
About us
|
Terms of use
|
Copyright © 2000-2025 MyWebUniversity.com ™