42 * Character set detection is at best an imprecise operation. The detection 43 * process will attempt to identify the charset that best matches the characteristics 44 * of the byte data, but the process is partly statistical in nature, and 45 * the results cannot be guaranteed to always be correct. 46 *
47 * For best accuracy in charset detection, the input data should be primarily 48 * in a single language, and at least a few hundred bytes of plain text 49 * in that language are needed. The detection process will attempt to 50 * ignore html or xml style markup that could otherwise obscure the content. 51 *
52 * An alternative to the ICU Charset Detector is the 53 * Compact Encoding Detector, https://github.com/google/compact_enc_det. 54 * It often gives more accurate results, especially with short input samples. 55 */ 56 57 58 struct UCharsetDetector; 59 /** 60 * Structure representing a charset detector 61 * @stable ICU 3.6 62 */ 63 typedef struct UCharsetDetector UCharsetDetector; 64 65 struct UCharsetMatch; 66 /** 67 * Opaque structure representing a match that was identified 68 * from a charset detection operation. 69 * @stable ICU 3.6 70 */ 71 typedef struct UCharsetMatch UCharsetMatch; 72 73 /** 74 * Open a charset detector. 75 * 76 * @param status Any error conditions occurring during the open 77 * operation are reported back in this variable. 78 * @return the newly opened charset detector. 79 * @stable ICU 3.6 80 */ 81 U_CAPI UCharsetDetector * U_EXPORT2 82 ucsdet_open(UErrorCode *status); 83 84 /** 85 * Close a charset detector. All storage and any other resources 86 * owned by this charset detector will be released. Failure to 87 * close a charset detector when finished with it can result in 88 * memory leaks in the application. 89 * 90 * @param ucsd The charset detector to be closed. 91 * @stable ICU 3.6 92 */ 93 U_CAPI void U_EXPORT2 94 ucsdet_close(UCharsetDetector *ucsd); 95 96 #if U_SHOW_CPLUSPLUS_API 97 98 U_NAMESPACE_BEGIN 99 100 /** 101 * \class LocalUCharsetDetectorPointer 102 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 103 * For most methods see the LocalPointerBase base class. 104 * 105 * @see LocalPointerBase 106 * @see LocalPointer 107 * @stable ICU 4.4 108 */ 109 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 110 111 U_NAMESPACE_END 112 113 #endif 114 115 /** 116 * Set the input byte data whose charset is to be detected. 117 * 118 * Ownership of the input text byte array remains with the caller.
119 * The input string must not be altered or deleted until the charset 120 * detector is either closed or reset to refer to different input text. 121 * 122 * @param ucsd the charset detector to be used. 123 * @param textIn the input text of unknown encoding. 124 * @param len the length of the input text, or -1 if the text 125 * is NUL terminated. 126 * @param status any error conditions are reported back in this variable. 127 * 128 * @stable ICU 3.6 129 */ 130 U_CAPI void U_EXPORT2 131 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 132 133 134 /** Set the declared encoding for charset detection. 135 * The declared encoding of an input text is an encoding obtained 136 * by the user from an http header or xml declaration or similar source that 137 * can be provided as an additional hint to the charset detector. 138 * 139 * How and whether the declared encoding will be used during the 140 * detection process is TBD. 141 * 142 * @param ucsd the charset detector to be used. 143 * @param encoding an encoding for the current data obtained from 144 * a header or declaration or other source outside 145 * of the byte data itself. 146 * @param length the length of the encoding name, or -1 if the name string 147 * is NUL terminated. 148 * @param status any error conditions are reported back in this variable. 149 * 150 * @stable ICU 3.6 151 */ 152 U_CAPI void U_EXPORT2 153 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 154 155 156 /** 157 * Return the charset that best matches the supplied input data. 158 * 159 * Note though, that because the detection 160 * only looks at the start of the input data, 161 * there is a possibility that the returned charset will fail to handle 162 * the full set of input data. 163 *
164 * The returned UCharsetMatch object is owned by the UCharsetDetector. 165 * It will remain valid until the detector input is reset, or until 166 * the detector is closed. 167 *
168 * The function will fail if 169 *
194 * The returned UCharsetMatch objects are owned by the UCharsetDetector. 195 * They will remain valid until the detector is closed or modified. 196 * 197 *
198 * Return an error if 199 *
334 * The state of the Charset detector that is passed in does not 335 * affect the result of this function, but requiring a valid, open 336 * charset detector as a parameter ensures that the charset detection 337 * service has been safely initialized and that the required detection 338 * data is available. 339 * 340 *
341 * Note: Multiple different charset encodings in the same family may use 342 * a single shared name in this implementation. For example, this function returns 343 * an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 344 * (Windows Latin 1). However, the actual detection result could be "windows-1252" 345 * when the input data matches Latin 1 code points along with any code points available only 346 * in "windows-1252". 347 * 348 * @param ucsd a Charset detector. 349 * @param status Any error conditions are reported back in this variable. 350 * @return an iterator providing access to the detectable charset names. 351 * @stable ICU 3.6 352 */ 353 U_CAPI UEnumeration * U_EXPORT2 354 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 355 356 /** 357 * Test whether input filtering is enabled for this charset detector. 358 * Input filtering removes text that appears to be HTML or xml 359 * markup from the input before applying the code page detection 360 * heuristics. 361 * 362 * @param ucsd The charset detector to check. 363 * @return true if filtering is enabled. 364 * @stable ICU 3.6 365 */ 366 367 U_CAPI UBool U_EXPORT2 368 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 369 370 371 /** 372 * Enable filtering of input text. If filtering is enabled, 373 * text within angle brackets ("<" and ">") will be removed 374 * before detection, which will remove most HTML or xml markup. 375 * 376 * @param ucsd the charset detector to be modified. 377 * @param filter true to enable input text filtering. 378 * @return The previous setting. 379 * 380 * @stable ICU 3.6 381 */ 382 U_CAPI UBool U_EXPORT2 383 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 384 385 #ifndef U_HIDE_INTERNAL_API 386 /** 387 * Get an iterator over the set of detectable charsets - 388 * over the charsets that are enabled by the specified charset detector.
389 * 390 * The returned UEnumeration provides access to the names of 391 * the charsets. 392 * 393 * @param ucsd a Charset detector. 394 * @param status Any error conditions are reported back in this variable. 395 * @return an iterator providing access to the detectable charset names by 396 * the specified charset detector. 397 * @internal 398 */ 399 U_CAPI UEnumeration * U_EXPORT2 400 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 401 402 /** 403 * Enable or disable an individual charset encoding. 404 * The name of the charset encoding must be one of the names returned by 405 * {@link #ucsdet_getAllDetectableCharsets()}. 406 * 407 * @param ucsd a Charset detector. 408 * @param encoding the name of the charset encoding. 409 * @param enabled true to enable, or false to disable the 410 * charset encoding. 411 * @param status receives the return status. When the named charset encoding 412 * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 413 * @internal 414 */ 415 U_CAPI void U_EXPORT2 416 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 417 #endif /* U_HIDE_INTERNAL_API */ 418 419 #endif 420 #endif /* __UCSDET_H */ 421 422
true
false