Where Online Learning is simpler!

The C and C++ Include Header Files

/usr/include/python3.12/cpython/unicodeobject.h


$ cat -n /usr/include/python3.12/cpython/unicodeobject.h

     1	#ifndef Py_CPYTHON_UNICODEOBJECT_H
     2	#  error "this header file must not be included directly"
     3	#endif
     4	
     5	/* Py_UNICODE was the native Unicode storage format (code unit) used by
     6	   Python and represents a single Unicode element in the Unicode type.
     7	   With PEP 393, Py_UNICODE is deprecated and replaced with a
     8	   typedef to wchar_t. */
     9	#define PY_UNICODE_TYPE wchar_t
    10	/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
    11	
    12	/* --- Internal Unicode Operations ---------------------------------------- */
    13	
    14	// Static inline functions to work with surrogates
    15	static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
    16	    return (0xD800 <= ch && ch <= 0xDFFF);
    17	}
    18	static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
    19	    return (0xD800 <= ch && ch <= 0xDBFF);
    20	}
    21	static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
    22	    return (0xDC00 <= ch && ch <= 0xDFFF);
    23	}
    24	
    25	// Join two surrogate characters and return a single Py_UCS4 value.
    26	static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)  {
    27	    assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
    28	    assert(Py_UNICODE_IS_LOW_SURROGATE(low));
    29	    return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
    30	}
    31	
    32	// High surrogate = top 10 bits added to 0xD800.
    33	// The character must be in the range [U+10000; U+10ffff].
    34	static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
    35	    assert(0x10000 <= ch && ch <= 0x10ffff);
    36	    return (0xD800 - (0x10000 >> 10) + (ch >> 10));
    37	}
    38	
    39	// Low surrogate = bottom 10 bits added to 0xDC00.
    40	// The character must be in the range [U+10000; U+10ffff].
    41	static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
    42	    assert(0x10000 <= ch && ch <= 0x10ffff);
    43	    return (0xDC00 + (ch & 0x3FF));
    44	}
    45	
    46	/* --- Unicode Type ------------------------------------------------------- */
    47	
    48	/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
    49	   structure. state.ascii and state.compact are set, and the data
    50	   immediately follow the structure. utf8_length can be found
    51	   in the length field; the utf8 pointer is equal to the data pointer. */
    52	typedef struct {
    53	    /* There are 4 forms of Unicode strings:
    54	
    55	       - compact ascii:
    56	
    57	         * structure = PyASCIIObject
    58	         * test: PyUnicode_IS_COMPACT_ASCII(op)
    59	         * kind = PyUnicode_1BYTE_KIND
    60	         * compact = 1
    61	         * ascii = 1
    62	         * (length is the length of the utf8)
    63	         * (data starts just after the structure)
    64	         * (since ASCII is decoded from UTF-8, the utf8 string are the data)
    65	
    66	       - compact:
    67	
    68	         * structure = PyCompactUnicodeObject
    69	         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
    70	         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
    71	           PyUnicode_4BYTE_KIND
    72	         * compact = 1
    73	         * ascii = 0
    74	         * utf8 is not shared with data
    75	         * utf8_length = 0 if utf8 is NULL
    76	         * (data starts just after the structure)
    77	
    78	       - legacy string:
    79	
    80	         * structure = PyUnicodeObject structure
    81	         * test: !PyUnicode_IS_COMPACT(op)
    82	         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
    83	           PyUnicode_4BYTE_KIND
    84	         * compact = 0
    85	         * data.any is not NULL
    86	         * utf8 is shared and utf8_length = length with data.any if ascii = 1
    87	         * utf8_length = 0 if utf8 is NULL
    88	
    89	       Compact strings use only one memory block (structure + characters),
    90	       whereas legacy strings use one block for the structure and one block
    91	       for characters.
    92	
    93	       Legacy strings are created by subclasses of Unicode.
    94	
    95	       See also _PyUnicode_CheckConsistency().
    96	    */
    97	    PyObject_HEAD
    98	    Py_ssize_t length;          /* Number of code points in the string */
    99	    Py_hash_t hash;             /* Hash value; -1 if not set */
   100	    struct {
   101	        /* If interned is non-zero, the two references from the
   102	           dictionary to this object are *not* counted in ob_refcnt.
   103	           The possible values here are:
   104	               0: Not Interned
   105	               1: Interned
   106	               2: Interned and Immortal
   107	               3: Interned, Immortal, and Static
   108	           This categorization allows the runtime to determine the right
   109	           cleanup mechanism at runtime shutdown. */
   110	        unsigned int interned:2;
   111	        /* Character size:
   112	
   113	           - PyUnicode_1BYTE_KIND (1):
   114	
   115	             * character type = Py_UCS1 (8 bits, unsigned)
   116	             * all characters are in the range U+0000-U+00FF (latin1)
   117	             * if ascii is set, all characters are in the range U+0000-U+007F
   118	               (ASCII), otherwise at least one character is in the range
   119	               U+0080-U+00FF
   120	
   121	           - PyUnicode_2BYTE_KIND (2):
   122	
   123	             * character type = Py_UCS2 (16 bits, unsigned)
   124	             * all characters are in the range U+0000-U+FFFF (BMP)
   125	             * at least one character is in the range U+0100-U+FFFF
   126	
   127	           - PyUnicode_4BYTE_KIND (4):
   128	
   129	             * character type = Py_UCS4 (32 bits, unsigned)
   130	             * all characters are in the range U+0000-U+10FFFF
   131	             * at least one character is in the range U+10000-U+10FFFF
   132	         */
   133	        unsigned int kind:3;
   134	        /* Compact is with respect to the allocation scheme. Compact unicode
   135	           objects only require one memory block while non-compact objects use
   136	           one block for the PyUnicodeObject struct and another for its data
   137	           buffer. */
   138	        unsigned int compact:1;
   139	        /* The string only contains characters in the range U+0000-U+007F (ASCII)
   140	           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
   141	           set, use the PyASCIIObject structure. */
   142	        unsigned int ascii:1;
   143	        /* The object is statically allocated. */
   144	        unsigned int statically_allocated:1;
   145	        /* Padding to ensure that PyUnicode_DATA() is always aligned to
   146	           4 bytes (see issue #19537 on m68k). */
   147	        unsigned int :24;
   148	    } state;
   149	} PyASCIIObject;
   150	
   151	/* Non-ASCII strings allocated through PyUnicode_New use the
   152	   PyCompactUnicodeObject structure. state.compact is set, and the data
   153	   immediately follow the structure. */
   154	typedef struct {
   155	    PyASCIIObject _base;
   156	    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
   157	                                 * terminating \0. */
   158	    char *utf8;                 /* UTF-8 representation (null-terminated) */
   159	} PyCompactUnicodeObject;
   160	
   161	/* Object format for Unicode subclasses. */
   162	typedef struct {
   163	    PyCompactUnicodeObject _base;
   164	    union {
   165	        void *any;
   166	        Py_UCS1 *latin1;
   167	        Py_UCS2 *ucs2;
   168	        Py_UCS4 *ucs4;
   169	    } data;                     /* Canonical, smallest-form Unicode buffer */
   170	} PyUnicodeObject;
   171	
   172	PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
   173	    PyObject *op,
   174	    int check_content);
   175	
   176	
   177	#define _PyASCIIObject_CAST(op) \
   178	    (assert(PyUnicode_Check(op)), \
   179	     _Py_CAST(PyASCIIObject*, (op)))
   180	#define _PyCompactUnicodeObject_CAST(op) \
   181	    (assert(PyUnicode_Check(op)), \
   182	     _Py_CAST(PyCompactUnicodeObject*, (op)))
   183	#define _PyUnicodeObject_CAST(op) \
   184	    (assert(PyUnicode_Check(op)), \
   185	     _Py_CAST(PyUnicodeObject*, (op)))
   186	
   187	
   188	/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
   189	
   190	/* Values for PyASCIIObject.state: */
   191	
   192	/* Interning state. */
   193	#define SSTATE_NOT_INTERNED 0
   194	#define SSTATE_INTERNED_MORTAL 1
   195	#define SSTATE_INTERNED_IMMORTAL 2
   196	#define SSTATE_INTERNED_IMMORTAL_STATIC 3
   197	
   198	/* Use only if you know it's a string */
   199	static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
   200	    return _PyASCIIObject_CAST(op)->state.interned;
   201	}
   202	#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
   203	
   204	/* For backward compatibility */
   205	static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
   206	    return 1;
   207	}
   208	#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
   209	
   210	/* Return true if the string contains only ASCII characters, or 0 if not. The
   211	   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   212	   ready. */
   213	static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
   214	    return _PyASCIIObject_CAST(op)->state.ascii;
   215	}
   216	#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
   217	
   218	/* Return true if the string is compact or 0 if not.
   219	   No type checks or Ready calls are performed. */
   220	static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
   221	    return _PyASCIIObject_CAST(op)->state.compact;
   222	}
   223	#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
   224	
   225	/* Return true if the string is a compact ASCII string (use PyASCIIObject
   226	   structure), or 0 if not.  No type checks or Ready calls are performed. */
   227	static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
   228	    return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
   229	}
   230	#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
   231	
   232	enum PyUnicode_Kind {
   233	/* Return values of the PyUnicode_KIND() function: */
   234	    PyUnicode_1BYTE_KIND = 1,
   235	    PyUnicode_2BYTE_KIND = 2,
   236	    PyUnicode_4BYTE_KIND = 4
   237	};
   238	
   239	// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
   240	//
   241	// gh-89653: Converting this macro to a static inline function would introduce
   242	// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
   243	// unsigned numbers) where kind type is an int or on
   244	// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
   245	#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
   246	
   247	/* Return a void pointer to the raw unicode buffer. */
   248	static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
   249	    if (PyUnicode_IS_ASCII(op)) {
   250	        return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
   251	    }
   252	    return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
   253	}
   254	
   255	static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
   256	    void *data;
   257	    assert(!PyUnicode_IS_COMPACT(op));
   258	    data = _PyUnicodeObject_CAST(op)->data.any;
   259	    assert(data != NULL);
   260	    return data;
   261	}
   262	
   263	static inline void* PyUnicode_DATA(PyObject *op) {
   264	    if (PyUnicode_IS_COMPACT(op)) {
   265	        return _PyUnicode_COMPACT_DATA(op);
   266	    }
   267	    return _PyUnicode_NONCOMPACT_DATA(op);
   268	}
   269	#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
   270	
   271	/* Return pointers to the canonical representation cast to unsigned char,
   272	   Py_UCS2, or Py_UCS4 for direct character access.
   273	   No checks are performed, use PyUnicode_KIND() before to ensure
   274	   these will work correctly. */
   275	
   276	#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
   277	#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
   278	#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
   279	
   280	/* Returns the length of the unicode string. */
   281	static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
   282	    return _PyASCIIObject_CAST(op)->length;
   283	}
   284	#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
   285	
   286	/* Write into the canonical representation, this function does not do any sanity
   287	   checks and is intended for usage in loops.  The caller should cache the
   288	   kind and data pointers obtained from other function calls.
   289	   index is the index in the string (starts at 0) and value is the new
   290	   code point value which should be written to that location. */
   291	static inline void PyUnicode_WRITE(int kind, void *data,
   292	                                   Py_ssize_t index, Py_UCS4 value)
   293	{
   294	    assert(index >= 0);
   295	    if (kind == PyUnicode_1BYTE_KIND) {
   296	        assert(value <= 0xffU);
   297	        _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
   298	    }
   299	    else if (kind == PyUnicode_2BYTE_KIND) {
   300	        assert(value <= 0xffffU);
   301	        _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
   302	    }
   303	    else {
   304	        assert(kind == PyUnicode_4BYTE_KIND);
   305	        assert(value <= 0x10ffffU);
   306	        _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
   307	    }
   308	}
   309	#define PyUnicode_WRITE(kind, data, index, value) \
   310	    PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
   311	                    (index), _Py_STATIC_CAST(Py_UCS4, value))
   312	
   313	/* Read a code point from the string's canonical representation.  No checks
   314	   or ready calls are performed. */
   315	static inline Py_UCS4 PyUnicode_READ(int kind,
   316	                                     const void *data, Py_ssize_t index)
   317	{
   318	    assert(index >= 0);
   319	    if (kind == PyUnicode_1BYTE_KIND) {
   320	        return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
   321	    }
   322	    if (kind == PyUnicode_2BYTE_KIND) {
   323	        return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
   324	    }
   325	    assert(kind == PyUnicode_4BYTE_KIND);
   326	    return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
   327	}
   328	#define PyUnicode_READ(kind, data, index) \
   329	    PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
   330	                   _Py_STATIC_CAST(const void*, data), \
   331	                   (index))
   332	
   333	/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   334	   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   335	   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   336	   cache kind and use PyUnicode_READ instead. */
   337	static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
   338	{
   339	    int kind;
   340	
   341	    assert(index >= 0);
   342	    // Tolerate reading the NUL character at str[len(str)]
   343	    assert(index <= PyUnicode_GET_LENGTH(unicode));
   344	
   345	    kind = PyUnicode_KIND(unicode);
   346	    if (kind == PyUnicode_1BYTE_KIND) {
   347	        return PyUnicode_1BYTE_DATA(unicode)[index];
   348	    }
   349	    if (kind == PyUnicode_2BYTE_KIND) {
   350	        return PyUnicode_2BYTE_DATA(unicode)[index];
   351	    }
   352	    assert(kind == PyUnicode_4BYTE_KIND);
   353	    return PyUnicode_4BYTE_DATA(unicode)[index];
   354	}
   355	#define PyUnicode_READ_CHAR(unicode, index) \
   356	    PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
   357	
   358	/* Return a maximum character value which is suitable for creating another
   359	   string based on op.  This is always an approximation but more efficient
   360	   than iterating over the string. */
   361	static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
   362	{
   363	    int kind;
   364	
   365	    if (PyUnicode_IS_ASCII(op)) {
   366	        return 0x7fU;
   367	    }
   368	
   369	    kind = PyUnicode_KIND(op);
   370	    if (kind == PyUnicode_1BYTE_KIND) {
   371	       return 0xffU;
   372	    }
   373	    if (kind == PyUnicode_2BYTE_KIND) {
   374	        return 0xffffU;
   375	    }
   376	    assert(kind == PyUnicode_4BYTE_KIND);
   377	    return 0x10ffffU;
   378	}
   379	#define PyUnicode_MAX_CHAR_VALUE(op) \
   380	    PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
   381	
   382	/* === Public API ========================================================= */
   383	
   384	/* --- Plain Py_UNICODE --------------------------------------------------- */
   385	
   386	/* With PEP 393, this is the recommended way to allocate a new unicode object.
   387	   This function will allocate the object and its buffer in a single memory
   388	   block.  Objects created using this function are not resizable. */
   389	PyAPI_FUNC(PyObject*) PyUnicode_New(
   390	    Py_ssize_t size,            /* Number of code points in the new string */
   391	    Py_UCS4 maxchar             /* maximum code point value in the string */
   392	    );
   393	
   394	/* For backward compatibility */
   395	static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
   396	{
   397	    return 0;
   398	}
   399	#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
   400	
   401	/* Get a copy of a Unicode string. */
   402	PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
   403	    PyObject *unicode
   404	    );
   405	
   406	/* Copy character from one unicode object into another, this function performs
   407	   character conversion when necessary and falls back to memcpy() if possible.
   408	
   409	   Fail if to is too small (smaller than *how_many* or smaller than
   410	   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   411	   kind(to), or if *to* has more than 1 reference.
   412	
   413	   Return the number of written character, or return -1 and raise an exception
   414	   on error.
   415	
   416	   Pseudo-code:
   417	
   418	       how_many = min(how_many, len(from) - from_start)
   419	       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
   420	       return how_many
   421	
   422	   Note: The function doesn't write a terminating null character.
   423	   */
   424	PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
   425	    PyObject *to,
   426	    Py_ssize_t to_start,
   427	    PyObject *from,
   428	    Py_ssize_t from_start,
   429	    Py_ssize_t how_many
   430	    );
   431	
   432	/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
   433	   may crash if parameters are invalid (e.g. if the output string
   434	   is too short). */
   435	PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
   436	    PyObject *to,
   437	    Py_ssize_t to_start,
   438	    PyObject *from,
   439	    Py_ssize_t from_start,
   440	    Py_ssize_t how_many
   441	    );
   442	
   443	/* Fill a string with a character: write fill_char into
   444	   unicode[start:start+length].
   445	
   446	   Fail if fill_char is bigger than the string maximum character, or if the
   447	   string has more than 1 reference.
   448	
   449	   Return the number of written character, or return -1 and raise an exception
   450	   on error. */
   451	PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
   452	    PyObject *unicode,
   453	    Py_ssize_t start,
   454	    Py_ssize_t length,
   455	    Py_UCS4 fill_char
   456	    );
   457	
   458	/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
   459	   if parameters are invalid (e.g. if length is longer than the string). */
   460	PyAPI_FUNC(void) _PyUnicode_FastFill(
   461	    PyObject *unicode,
   462	    Py_ssize_t start,
   463	    Py_ssize_t length,
   464	    Py_UCS4 fill_char
   465	    );
   466	
   467	/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
   468	   Scan the string to find the maximum character. */
   469	PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
   470	    int kind,
   471	    const void *buffer,
   472	    Py_ssize_t size);
   473	
   474	/* Create a new string from a buffer of ASCII characters.
   475	   WARNING: Don't check if the string contains any non-ASCII character. */
   476	PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
   477	    const char *buffer,
   478	    Py_ssize_t size);
   479	
   480	/* Compute the maximum character of the substring unicode[start:end].
   481	   Return 127 for an empty string. */
   482	PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
   483	    PyObject *unicode,
   484	    Py_ssize_t start,
   485	    Py_ssize_t end);
   486	
   487	/* --- _PyUnicodeWriter API ----------------------------------------------- */
   488	
   489	typedef struct {
   490	    PyObject *buffer;
   491	    void *data;
   492	    int kind;
   493	    Py_UCS4 maxchar;
   494	    Py_ssize_t size;
   495	    Py_ssize_t pos;
   496	
   497	    /* minimum number of allocated characters (default: 0) */
   498	    Py_ssize_t min_length;
   499	
   500	    /* minimum character (default: 127, ASCII) */
   501	    Py_UCS4 min_char;
   502	
   503	    /* If non-zero, overallocate the buffer (default: 0). */
   504	    unsigned char overallocate;
   505	
   506	    /* If readonly is 1, buffer is a shared string (cannot be modified)
   507	       and size is set to 0. */
   508	    unsigned char readonly;
   509	} _PyUnicodeWriter ;
   510	
   511	/* Initialize a Unicode writer.
   512	 *
   513	 * By default, the minimum buffer size is 0 character and overallocation is
   514	 * disabled. Set min_length, min_char and overallocate attributes to control
   515	 * the allocation of the buffer. */
   516	PyAPI_FUNC(void)
   517	_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
   518	
   519	/* Prepare the buffer to write 'length' characters
   520	   with the specified maximum character.
   521	
   522	   Return 0 on success, raise an exception and return -1 on error. */
   523	#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
   524	    (((MAXCHAR) <= (WRITER)->maxchar                                  \
   525	      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
   526	     ? 0                                                              \
   527	     : (((LENGTH) == 0)                                               \
   528	        ? 0                                                           \
   529	        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
   530	
   531	/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
   532	   instead. */
   533	PyAPI_FUNC(int)
   534	_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
   535	                                 Py_ssize_t length, Py_UCS4 maxchar);
   536	
   537	/* Prepare the buffer to have at least the kind KIND.
   538	   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   539	   support characters in range U+000-U+FFFF.
   540	
   541	   Return 0 on success, raise an exception and return -1 on error. */
   542	#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
   543	    ((KIND) <= (WRITER)->kind                                         \
   544	     ? 0                                                              \
   545	     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
   546	
   547	/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
   548	   macro instead. */
   549	PyAPI_FUNC(int)
   550	_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
   551	                                     int kind);
   552	
   553	/* Append a Unicode character.
   554	   Return 0 on success, raise an exception and return -1 on error. */
   555	PyAPI_FUNC(int)
   556	_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
   557	    Py_UCS4 ch
   558	    );
   559	
   560	/* Append a Unicode string.
   561	   Return 0 on success, raise an exception and return -1 on error. */
   562	PyAPI_FUNC(int)
   563	_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
   564	    PyObject *str               /* Unicode string */
   565	    );
   566	
   567	/* Append a substring of a Unicode string.
   568	   Return 0 on success, raise an exception and return -1 on error. */
   569	PyAPI_FUNC(int)
   570	_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
   571	    PyObject *str,              /* Unicode string */
   572	    Py_ssize_t start,
   573	    Py_ssize_t end
   574	    );
   575	
   576	/* Append an ASCII-encoded byte string.
   577	   Return 0 on success, raise an exception and return -1 on error. */
   578	PyAPI_FUNC(int)
   579	_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
   580	    const char *str,           /* ASCII-encoded byte string */
   581	    Py_ssize_t len             /* number of bytes, or -1 if unknown */
   582	    );
   583	
   584	/* Append a latin1-encoded byte string.
   585	   Return 0 on success, raise an exception and return -1 on error. */
   586	PyAPI_FUNC(int)
   587	_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
   588	    const char *str,           /* latin1-encoded byte string */
   589	    Py_ssize_t len             /* length in bytes */
   590	    );
   591	
   592	/* Get the value of the writer as a Unicode string. Clear the
   593	   buffer of the writer. Raise an exception and return NULL
   594	   on error. */
   595	PyAPI_FUNC(PyObject *)
   596	_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
   597	
   598	/* Deallocate memory of a writer (clear its internal buffer). */
   599	PyAPI_FUNC(void)
   600	_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
   601	
   602	
   603	/* Format the object based on the format_spec, as defined in PEP 3101
   604	   (Advanced String Formatting). */
   605	PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
   606	    _PyUnicodeWriter *writer,
   607	    PyObject *obj,
   608	    PyObject *format_spec,
   609	    Py_ssize_t start,
   610	    Py_ssize_t end);
   611	
   612	/* --- Manage the default encoding ---------------------------------------- */
   613	
   614	/* Returns a pointer to the default encoding (UTF-8) of the
   615	   Unicode object unicode.
   616	
   617	   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
   618	   in the unicodeobject.
   619	
   620	   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
   621	   support the previous internal function with the same behaviour.
   622	
   623	   Use of this API is DEPRECATED since no size information can be
   624	   extracted from the returned data.
   625	*/
   626	
   627	PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
   628	
   629	#define _PyUnicode_AsString PyUnicode_AsUTF8
   630	
   631	/* --- UTF-7 Codecs ------------------------------------------------------- */
   632	
   633	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
   634	    PyObject *unicode,          /* Unicode object */
   635	    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
   636	    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
   637	    const char *errors          /* error handling */
   638	    );
   639	
   640	/* --- UTF-8 Codecs ------------------------------------------------------- */
   641	
   642	PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
   643	    PyObject *unicode,
   644	    const char *errors);
   645	
   646	/* --- UTF-32 Codecs ------------------------------------------------------ */
   647	
   648	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
   649	    PyObject *object,           /* Unicode object */
   650	    const char *errors,         /* error handling */
   651	    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
   652	    );
   653	
   654	/* --- UTF-16 Codecs ------------------------------------------------------ */
   655	
   656	/* Returns a Python string object holding the UTF-16 encoded value of
   657	   the Unicode data.
   658	
   659	   If byteorder is not 0, output is written according to the following
   660	   byte order:
   661	
   662	   byteorder == -1: little endian
   663	   byteorder == 0:  native byte order (writes a BOM mark)
   664	   byteorder == 1:  big endian
   665	
   666	   If byteorder is 0, the output string will always start with the
   667	   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   668	   prepended.
   669	*/
   670	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
   671	    PyObject* unicode,          /* Unicode object */
   672	    const char *errors,         /* error handling */
   673	    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
   674	    );
   675	
   676	/* --- Unicode-Escape Codecs ---------------------------------------------- */
   677	
   678	/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
   679	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
   680	        const char *string,     /* Unicode-Escape encoded string */
   681	        Py_ssize_t length,      /* size of string */
   682	        const char *errors,     /* error handling */
   683	        Py_ssize_t *consumed    /* bytes consumed */
   684	);
   685	/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   686	   chars. */
   687	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
   688	        const char *string,     /* Unicode-Escape encoded string */
   689	        Py_ssize_t length,      /* size of string */
   690	        const char *errors,     /* error handling */
   691	        Py_ssize_t *consumed,   /* bytes consumed */
   692	        const char **first_invalid_escape  /* on return, points to first
   693	                                              invalid escaped char in
   694	                                              string. */
   695	);
   696	
   697	/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
   698	
   699	/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
   700	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
   701	        const char *string,     /* Unicode-Escape encoded string */
   702	        Py_ssize_t length,      /* size of string */
   703	        const char *errors,     /* error handling */
   704	        Py_ssize_t *consumed    /* bytes consumed */
   705	);
   706	
   707	/* --- Latin-1 Codecs ----------------------------------------------------- */
   708	
   709	PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
   710	    PyObject* unicode,
   711	    const char* errors);
   712	
   713	/* --- ASCII Codecs ------------------------------------------------------- */
   714	
   715	PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
   716	    PyObject* unicode,
   717	    const char* errors);
   718	
   719	/* --- Character Map Codecs ----------------------------------------------- */
   720	
   721	/* Translate a Unicode object by applying a character mapping table to
   722	   it and return the resulting Unicode object.
   723	
   724	   The mapping table must map Unicode ordinal integers to Unicode strings,
   725	   Unicode ordinal integers or None (causing deletion of the character).
   726	
   727	   Mapping tables may be dictionaries or sequences. Unmapped character
   728	   ordinals (ones which cause a LookupError) are left untouched and
   729	   are copied as-is.
   730	   NOTE(review): this wording describes translation, yet the declaration below is an encoder -- confirm. */
   731	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
   732	    PyObject *unicode,          /* Unicode object */
   733	    PyObject *mapping,          /* encoding mapping */
   734	    const char *errors          /* error handling */
   735	    );
   736	
   737	/* --- Decimal Encoder ---------------------------------------------------- */
   738	
   739	/* Converts a Unicode object holding a decimal value to an ASCII string
   740	   for use in the int, float and complex parsers.
   741	   Transforms code points that have the decimal digit property to the
   742	   corresponding ASCII digit code points.  Transforms spaces to ASCII.
   743	   From the first non-ASCII code point that is neither a decimal digit nor
   744	   a space through to the end, every code point is transformed into '?'. */
   745	
   746	PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
   747	    PyObject *unicode           /* Unicode object */
   748	    );
   749	
   750	/* --- Methods & Slots ---------------------------------------------------- */
   751	
   752	PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
   753	    PyObject *separator,        /* presumably the string placed between items -- confirm */
   754	    PyObject *const *items,     /* array of object pointers; the pointers themselves are const */
   755	    Py_ssize_t seqlen           /* presumably the number of entries in items -- confirm */
   756	    );
   757	
   758	/* Test whether a unicode object is equal to an ASCII identifier.  Return 1
   759	   if true, 0 otherwise.  The right argument must be an ASCII identifier.
   760	   Any error that occurs inside is cleared before returning. */
   761	PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
   762	    PyObject *left,             /* Left string */
   763	    _Py_Identifier *right       /* Right identifier */
   764	    );
   765	
   766	/* Test whether a unicode object is equal to an ASCII string.  Return 1 if
   767	   true, 0 otherwise.  The right argument must be an ASCII-encoded string.
   768	   Any error that occurs inside is cleared before returning. */
   769	PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
   770	    PyObject *left,             /* Left string */
   771	    const char *right           /* ASCII-encoded string */
   772	    );
   773	
   774	/* Externally visible for str.strip(unicode) */
   775	PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
   776	    PyObject *self,             /* presumably the string being stripped -- confirm */
   777	    int striptype,              /* NOTE(review): accepted values are not documented here */
   778	    PyObject *sepobj            /* presumably the characters to strip -- confirm */
   779	    );
   780	
   781	/* Using explicit passed-in values, insert the thousands grouping
   782	   into the string pointed to by buffer.  For the argument descriptions,
   783	   see Objects/stringlib/localeutil.h */
   784	PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
   785	    _PyUnicodeWriter *writer,   /* NOTE(review): likely the "buffer" the comment above refers to; no parameter has that name -- confirm */
   786	    Py_ssize_t n_buffer,
   787	    PyObject *digits,
   788	    Py_ssize_t d_pos,
   789	    Py_ssize_t n_digits,
   790	    Py_ssize_t min_width,
   791	    const char *grouping,
   792	    PyObject *thousands_sep,
   793	    Py_UCS4 *maxchar);
   794	
   795	/* === Characters Type APIs =============================================== */
   796	
   797	/* These should not be used directly. Use the Py_UNICODE_IS* and
   798	   Py_UNICODE_TO* macros instead.
   799	
   800	   These APIs are implemented in Objects/unicodectype.c.
   801	
   802	*/
   803	
   804	PyAPI_FUNC(int) _PyUnicode_IsLowercase(
   805	    Py_UCS4 ch       /* Unicode character */
   806	    );  /* reached through Py_UNICODE_ISLOWER() */
   807	
   808	PyAPI_FUNC(int) _PyUnicode_IsUppercase(
   809	    Py_UCS4 ch       /* Unicode character */
   810	    );  /* reached through Py_UNICODE_ISUPPER() */
   811	
   812	PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
   813	    Py_UCS4 ch       /* Unicode character */
   814	    );  /* reached through Py_UNICODE_ISTITLE() */
   815	
   816	PyAPI_FUNC(int) _PyUnicode_IsXidStart(
   817	    Py_UCS4 ch       /* Unicode character */
   818	    );
   819	
   820	PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
   821	    Py_UCS4 ch       /* Unicode character */
   822	    );
   823	
   824	PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
   825	    const Py_UCS4 ch         /* Unicode character */
   826	    );  /* Py_UNICODE_ISSPACE() calls this for ch >= 128 */
   827	
   828	PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
   829	    const Py_UCS4 ch         /* Unicode character */
   830	    );  /* reached through Py_UNICODE_ISLINEBREAK() */
   831	
   832	/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
   833	    Py_UCS4 ch       /* Unicode character */
   834	    );  /* reached through Py_UNICODE_TOLOWER() */
   835	
   836	/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
   837	    Py_UCS4 ch       /* Unicode character */
   838	    );  /* reached through Py_UNICODE_TOUPPER() */
   839	
   840	Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
   841	    Py_UCS4 ch       /* Unicode character */
   842	    );  /* reached through Py_UNICODE_TOTITLE(); NOTE(review): the deprecation marker is live here but commented out on the two declarations above -- confirm this is intended */
   843	
   844	PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
   845	    Py_UCS4 ch,       /* Unicode character */
   846	    Py_UCS4 *res      /* out: presumably receives the mapped code point(s); required capacity not stated here -- confirm */
   847	    );
   848	
   849	PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
   850	    Py_UCS4 ch,       /* Unicode character */
   851	    Py_UCS4 *res      /* out: presumably receives the mapped code point(s); required capacity not stated here -- confirm */
   852	    );
   853	
   854	PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
   855	    Py_UCS4 ch,       /* Unicode character */
   856	    Py_UCS4 *res      /* out: presumably receives the mapped code point(s); required capacity not stated here -- confirm */
   857	    );
   858	
   859	PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
   860	    Py_UCS4 ch,       /* Unicode character */
   861	    Py_UCS4 *res      /* out: presumably receives the folded code point(s); required capacity not stated here -- confirm */
   862	    );
   863	
   864	PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
   865	    Py_UCS4 ch         /* Unicode character */
   866	    );
   867	
   868	PyAPI_FUNC(int) _PyUnicode_IsCased(
   869	    Py_UCS4 ch         /* Unicode character */
   870	    );
   871	
   872	PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
   873	    Py_UCS4 ch       /* Unicode character */
   874	    );  /* reached through Py_UNICODE_TODECIMAL() */
   875	
   876	PyAPI_FUNC(int) _PyUnicode_ToDigit(
   877	    Py_UCS4 ch       /* Unicode character */
   878	    );  /* reached through Py_UNICODE_TODIGIT() */
   879	
   880	PyAPI_FUNC(double) _PyUnicode_ToNumeric(
   881	    Py_UCS4 ch       /* Unicode character */
   882	    );  /* reached through Py_UNICODE_TONUMERIC() */
   883	
   884	PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
   885	    Py_UCS4 ch       /* Unicode character */
   886	    );  /* reached through Py_UNICODE_ISDECIMAL() */
   887	
   888	PyAPI_FUNC(int) _PyUnicode_IsDigit(
   889	    Py_UCS4 ch       /* Unicode character */
   890	    );  /* reached through Py_UNICODE_ISDIGIT() */
   891	
   892	PyAPI_FUNC(int) _PyUnicode_IsNumeric(
   893	    Py_UCS4 ch       /* Unicode character */
   894	    );  /* reached through Py_UNICODE_ISNUMERIC() */
   895	
   896	PyAPI_FUNC(int) _PyUnicode_IsPrintable(
   897	    Py_UCS4 ch       /* Unicode character */
   898	    );  /* reached through Py_UNICODE_ISPRINTABLE() */
   899	
   900	PyAPI_FUNC(int) _PyUnicode_IsAlpha(
   901	    Py_UCS4 ch       /* Unicode character */
   902	    );  /* reached through Py_UNICODE_ISALPHA() */
   903	
   904	// Lookup table used by Py_UNICODE_ISSPACE(); it is indexed by code points below 128.
   905	PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
   906	
   907	// Whitespace test with an inlined fast path.  Splitting on whitespace
   908	// is an important use case and whitespace is, in most situations,
   909	// purely ASCII, so code points below 128 are answered straight from
   910	// the _Py_ascii_whitespace look-up table declared above.
   911	static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
   912	    // Table hit for the ASCII range; every other code point takes the
   913	    // generic _PyUnicode_IsWhitespace() route.
   914	    return (ch < 128) ? _Py_ascii_whitespace[ch]
   915	                      : _PyUnicode_IsWhitespace(ch);
   916	}
   917	
   918	#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)  /* the macros in this run are plain aliases for _PyUnicode_* functions */
   919	#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
   920	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
   921	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
   922	
   923	#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)  /* callee returns a single Py_UCS4 */
   924	#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)  /* callee returns a single Py_UCS4 */
   925	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)  /* callee is declared with Py_DEPRECATED(3.3) */
   926	
   927	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
   928	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
   929	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
   930	#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
   931	
   932	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)  /* callee returns int */
   933	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)  /* callee returns int */
   934	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)  /* callee returns double */
   935	
   936	#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
   937	
   938	static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
   939	    // Alphanumeric = alphabetic, decimal, digit or numeric, tested in that order; yields 1 or 0.
   940	    int alpha_or_decimal = Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch);
   941	    return (alpha_or_decimal
   942	            || Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch));
   943	}
   944	
   945	
   946	/* === Misc functions ===================================================== */
   947	
   948	PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);  /* NOTE(review): parameters are unnamed here; their meaning must be taken from the definition */
   949	
   950	/* Return an interned Unicode object for an Identifier; may fail if there is no memory (failure presumably signalled by NULL -- confirm). */
   951	PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
   952	
   953	/* Fast equality check for when both inputs are known to be exact unicode
   954	   types and their hash values are equal (i.e. a very probable match). */
   955	PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
   956	
   957	/* Equality check.  No precondition is stated for it here, unlike _PyUnicode_EQ. */
   958	PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
   959	
   960	PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);  /* NOTE(review): (object, address) signature looks like an argument-parsing converter callback -- confirm */
   961	PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);  /* NOTE(review): presumably the variant for an optional argument -- confirm */
   962	
   963	PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);  /* NOTE(review): meaning of the returned Py_ssize_t is not documented here -- confirm */