180 lines
7.6 KiB
C
180 lines
7.6 KiB
C
/* zueci.h - UTF-8 to/from Extended Channel Interpretations */
|
|
/*
|
|
libzueci - an open source UTF-8 ECI library adapted from libzint
|
|
Copyright (C) 2022 gitlost
|
|
*/
|
|
/* SPDX-License-Identifier: BSD-3-Clause */
|
|
|
|
#ifndef ZUECI_H
|
|
#define ZUECI_H
|
|
|
|
/* Version: 1.0.1 */
|
|
|
|
/* Warning and error returns from API functions below */
|
|
#define ZUECI_WARN_INVALID_DATA 1 /* Warning: invalid data but replacement character used */
|
|
#define ZUECI_ERROR 5 /* Warn/error marker, not returned; the warning code is below it, the error codes above */
|
|
#define ZUECI_ERROR_INVALID_DATA 6 /* Source data invalid or unmappable */
|
|
#define ZUECI_ERROR_INVALID_ECI 7 /* ECI not a valid Character Set ECI */
|
|
#define ZUECI_ERROR_INVALID_ARGS 8 /* One or more arguments invalid (e.g. NULL) */
|
|
#define ZUECI_ERROR_INVALID_UTF8 9 /* Source data not valid UTF-8 */
|
|
|
|
/* `ZUECI_EXTERN` prefixes the API function declarations in this header and selects their linkage */
#ifdef _WIN32
|
|
# if defined(DLL_EXPORT) || defined(PIC) || defined(_USRDLL) /* Any one of these selects `dllexport` */
|
|
# define ZUECI_EXTERN __declspec(dllexport)
|
|
# elif defined(ZUECI_DLL) /* NOTE(review): presumably defined by clients importing from the DLL - confirm */
|
|
# define ZUECI_EXTERN __declspec(dllimport)
|
|
# else /* Windows with none of the above defined: plain `extern` */
|
|
# define ZUECI_EXTERN extern
|
|
# endif
|
|
#else /* Not Windows: always plain `extern` */
|
|
# define ZUECI_EXTERN extern
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
|
|
ECI arg `eci` must be a valid Interpretative Character Set ECI, i.e. 0-13, 15-18, 20-35, 170 or 899,
|
|
as defined by AIM ITS/04-023 International Technical Standard - Extended Channel Interpretations
|
|
Part 3: Register (Version 2, February 2022):
|
|
|
|
0 IBM CP437 (top)
|
|
1 ISO/IEC 8859-1 - Latin alphabet No. 1 (top)
|
|
2 IBM CP437 (top)
|
|
3 ISO/IEC 8859-1 - Latin alphabet No. 1 (top)
|
|
4 ISO/IEC 8859-2 - Latin alphabet No. 2 (top)
|
|
5 ISO/IEC 8859-3 - Latin alphabet No. 3 (top)
|
|
6 ISO/IEC 8859-4 - Latin alphabet No. 4 (top)
|
|
7 ISO/IEC 8859-5 - Latin/Cyrillic alphabet (top)
|
|
8 ISO/IEC 8859-6 - Latin/Arabic alphabet (top)
|
|
9 ISO/IEC 8859-7 - Latin/Greek alphabet (top)
|
|
10 ISO/IEC 8859-8 - Latin/Hebrew alphabet (top)
|
|
11 ISO/IEC 8859-9 - Latin alphabet No. 5 (Turkish) (top)
|
|
12 ISO/IEC 8859-10 - Latin alphabet No. 6 (Nordic) (top)
|
|
13 ISO/IEC 8859-11 - Latin/Thai alphabet (top)
|
|
15 ISO/IEC 8859-13 - Latin alphabet No. 7 (Baltic) (top)
|
|
16 ISO/IEC 8859-14 - Latin alphabet No. 8 (Celtic) (top)
|
|
17 ISO/IEC 8859-15 - Latin alphabet No. 9 (top)
|
|
18 ISO/IEC 8859-16 - Latin alphabet No. 10 (top)
|
|
20 Shift JIS (JIS X 0208 and JIS X 0201) Japanese
|
|
21 Windows 1250 - Latin 2 (Central Europe)
|
|
22 Windows 1251 - Cyrillic
|
|
23 Windows 1252 - Latin 1
|
|
24 Windows 1256 - Arabic
|
|
25 UTF-16BE (big-endian)
|
|
26 UTF-8
|
|
27 ASCII (ISO/IEC 646 IRV)
|
|
28 Big5 (Taiwan) Chinese
|
|
29 GB 2312 (PRC) Chinese
|
|
30 EUC-KR (KS X 1001:2002) Korean
|
|
31 GBK Chinese
|
|
32 GB 18030 Chinese
|
|
33 UTF-16LE (little-endian)
|
|
34 UTF-32BE (big-endian)
|
|
35 UTF-32LE (little-endian)
|
|
170 ISO/IEC 646 Invariant
|
|
899 8-bit binary data
|
|
|
|
"(top)" means encoding applies to codepoints 0x80..FF (or 0xA0..FF for ISO/IEC 8859) with 0x00..7F as ASCII
|
|
|
|
ECIs 0, 1 and 2 are obsolete, however ECI 2 is still referenced by ISO/IEC 15438:2015 (PDF417) Annex H.2.3
|
|
|
|
All except ECI 20 (Shift JIS) and ECI 170 (ISO/IEC 646 Invariant) map ASCII one-to-one (but see
|
|
`ZUECI_FLAG_XXX` flags below).
|
|
ECI 20 re-maps 2 characters (backslash and tilde), and ECI 170 has no mapping for 12 characters (#$@[\]^`{|}~).
|
|
|
|
All mappings are the same as libiconv with the following exception for ECI 20 (Shift JIS):
|
|
Unicode Shift JIS Unicode
|
|
libzueci U+005C -> 0x815F -> U+005C (U+005C REVERSE SOLIDUS)
|
|
U+FF3C -> no mapping (U+FF3C FULLWIDTH REVERSE SOLIDUS)
|
|
|
|
libiconv U+005C -> no mapping
|
|
U+FF3C -> 0x815F -> U+FF3C
|
|
The rationale for this difference is that libzueci is following the "official" source
|
|
https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT
|
|
(2015-12-02) which gives those mappings. (Note "official" is used loosely, there's no such thing unfortunately.)
|
|
Could not find a reason for libiconv doing it its way from reading the source.
|
|
|
|
All other mappings are the same; in particular:
|
|
Unicode Shift JIS Unicode
|
|
U+007E -> no mapping (U+007E TILDE)
|
|
U+203E -> 0x7E -> U+203E (U+203E OVERLINE)
|
|
U+00A5 -> 0x5C -> U+00A5 (U+00A5 YEN SIGN)
|
|
*/
|
|
|
|
/*
|
|
If embedding the library (i.e. including the 10 files directly) and only want ECI-to-UTF-8 functionality,
|
|
define `ZUECI_EMBED_NO_TO_ECI`
|
|
*/
|
|
#ifndef ZUECI_EMBED_NO_TO_ECI
|
|
|
|
/*
|
|
Convert UTF-8 `src` of length `src_len` to `eci`-encoded `dest`.
|
|
`p_dest_len` is set to length of `dest` on output.
|
|
`dest` must be big enough (4-times the `src_len`, or see `zueci_dest_len_eci()`). It is not NUL-terminated.
|
|
Returns 0 if successful, one of `ZUECI_ERROR_XXX` if not.
|
|
*/
|
|
/* In: `eci`, `src`, `src_len`. Out: `dest` (caller-supplied, not NUL-terminated), `p_dest_len` (length of `dest`) */
ZUECI_EXTERN int zueci_utf8_to_eci(const int eci, const unsigned char src[], const int src_len,
|
|
unsigned char dest[], int *p_dest_len);
|
|
|
|
/*
|
|
Calculate sufficient (i.e. approx.) length needed to convert UTF-8 `src` of length `src_len` from UTF-8 to ECI
|
|
`eci`, and place in `p_dest_len`.
|
|
Returns 0 if successful, one of `ZUECI_ERROR_XXX` if not.
|
|
*/
|
|
/* `p_dest_len` receives a sufficient (not necessarily exact) `dest` length for use with `zueci_utf8_to_eci()` */
ZUECI_EXTERN int zueci_dest_len_eci(const int eci, const unsigned char src[], const int src_len, int *p_dest_len);
|
|
|
|
#endif /* ZUECI_EMBED_NO_TO_ECI */
|
|
|
|
/*
|
|
These flags can be OR-ed together to change the behaviour of `zueci_eci_to_utf8()` and `zueci_dest_len_utf8()`
|
|
*/
|
|
|
|
/*
|
|
For single-byte ECIs copy the source straight-thru rather than erroring or replacing if undefined. Affects
|
|
ISO/IEC 8859 (ECIs 1, 3-13, 15-18), Windows 125x (ECIs 21-24), ASCII (ECI 27) & ISO/IEC 646 Invariant (ECI 170).
|
|
*/
|
|
#define ZUECI_FLAG_SB_STRAIGHT_THRU 1 /* Bit value 0x01, may be OR-ed with the other `ZUECI_FLAG_XXX` flag */
|
|
|
|
/*
|
|
For ECI 20 Shift JIS, copy backslash & tilde straight-thru rather than mapping to Yen sign & overline resp.
|
|
*/
|
|
#define ZUECI_FLAG_SJIS_STRAIGHT_THRU 2 /* Bit value 0x02, may be OR-ed with the other `ZUECI_FLAG_XXX` flag */
|
|
|
|
/*
|
|
If embedding the library (i.e. including the 10 files directly) and only want UTF-8-to-ECI functionality,
|
|
define `ZUECI_EMBED_NO_TO_UTF8`
|
|
*/
|
|
#ifndef ZUECI_EMBED_NO_TO_UTF8
|
|
|
|
/*
|
|
Convert ECI-encoded `src` of length `src_len` to UTF-8 `dest`.
|
|
`p_dest_len` is set to length of `dest` on output.
|
|
`dest` must be big enough (4-times the `src_len`, or see `zueci_dest_len_utf8()`). It is not NUL-terminated.
|
|
If the Unicode BMP `replacement_char` (<= 0xFFFF) is non-zero then it will be substituted for all source characters
|
|
with no mapping and processing will continue, returning ZUECI_WARN_INVALID_DATA unless other errors.
|
|
`flags` can be set with `ZUECI_FLAG_XXX` to change behaviour.
|
|
Returns 0 if successful, `ZUECI_WARN_INVALID_DATA` if `replacement_char` was substituted, else one of `ZUECI_ERROR_XXX`.
|
|
*/
|
|
/*
In: `eci`, `src`, `src_len`, `replacement_char` (zero means no substitution), `flags` (`ZUECI_FLAG_XXX`, zero for none).
Out: `dest` (caller-supplied, not NUL-terminated), `p_dest_len` (length of `dest`).
*/
ZUECI_EXTERN int zueci_eci_to_utf8(const int eci, const unsigned char src[], const int src_len,
|
|
const unsigned int replacement_char, const unsigned int flags, unsigned char dest[],
|
|
int *p_dest_len);
|
|
|
|
/*
|
|
Calculate exact length needed to convert ECI-encoded `src` of length `src_len` from ECI `eci`, and place in
|
|
`p_dest_len`.
|
|
Returns 0 if successful, one of `ZUECI_ERROR_XXX` if not.
|
|
*/
|
|
/* Takes `replacement_char` and `flags` as for `zueci_eci_to_utf8()`; `p_dest_len` receives the exact `dest` length */
ZUECI_EXTERN int zueci_dest_len_utf8(const int eci, const unsigned char src[], const int src_len,
|
|
const unsigned int replacement_char, const unsigned int flags, int *p_dest_len);
|
|
|
|
#endif /* ZUECI_EMBED_NO_TO_UTF8 */
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
/* vim: set ts=4 sw=4 et : */
|
|
#endif /* ZUECI_H */
|