// CkCharsetW.h: interface for the CkCharsetW class. // ////////////////////////////////////////////////////////////////////// // This header is generated for Chilkat 11.3.0 #ifndef _CkCharsetW_H #define _CkCharsetW_H #include "chilkatDefs.h" #include "CkString.h" #include "CkWideCharBase.h" class CkByteData; #if !defined(__sun__) && !defined(__sun) #pragma pack (push, 8) #endif // CLASS: CkCharsetW class CK_VISIBLE_PUBLIC CkCharsetW : public CkWideCharBase { private: // Don't allow assignment or copying these objects. CkCharsetW(const CkCharsetW &); CkCharsetW &operator=(const CkCharsetW &); public: CkCharsetW(void); virtual ~CkCharsetW(void); static CkCharsetW *createNew(void); void CK_VISIBLE_PRIVATE inject(void *impl); // May be called when finished with the object to free/dispose of any // internal resources held by the object. void dispose(void); // BEGIN PUBLIC INTERFACE // ---------------------- // Properties // ---------------------- // If the ErrorAction property is set to 6, then this property controls how errors // are handled. It specifies an alternate To charset. When a character in the input // data cannot be converted to the target charset, an attempt is made to convert it // to the AltToCharset. If that fails, the input character is dropped. void get_AltToCharset(CkString &str); // If the ErrorAction property is set to 6, then this property controls how errors // are handled. It specifies an alternate To charset. When a character in the input // data cannot be converted to the target charset, an attempt is made to convert it // to the AltToCharset. If that fails, the input character is dropped. const wchar_t *altToCharset(void); // If the ErrorAction property is set to 6, then this property controls how errors // are handled. It specifies an alternate To charset. When a character in the input // data cannot be converted to the target charset, an attempt is made to convert it // to the AltToCharset. If that fails, the input character is dropped. void put_AltToCharset(const wchar_t *newVal); // Controls how errors are handled. When a character in the input data cannot be // converted to the target charset, the action taken is controlled by this // property. The possible settings are: (0) drop the error characters, (1) // substitute the data set by the SetErrorString method, (2) convert to a // hex-escaped string ( int get_ErrorAction(void); // Controls how errors are handled. When a character in the input data cannot be // converted to the target charset, the action taken is controlled by this // property. The possible settings are: (0) drop the error characters, (1) // substitute the data set by the SetErrorString method, (2) convert to a // hex-escaped string ( void put_ErrorAction(int newVal); // Tells the charset converter the charset of the input data for a conversion. // Possible values are:2 face=MS Sans Serif> // us-ascii // unicode (also known as UTF16LE or simply UTF16) // unicodefffe (also known as UTF16BE) // ebcdic // iso-8859-1 // iso-8859-2 // iso-8859-3 // iso-8859-4 // iso-8859-5 // iso-8859-6 // iso-8859-7 // iso-8859-8 // iso-8859-9 // iso-8859-13 // iso-8859-15 // windows-874 // windows-1250 // windows-1251 // windows-1252 // windows-1253 // windows-1254 // windows-1255 // windows-1256 // windows-1257 // windows-1258 // utf-7 // utf-8 // utf-32 // utf-32be // shift_jis // gb2312 // ks_c_5601-1987 // big5 // iso-2022-jp // iso-2022-kr // euc-jp // euc-kr // macintosh // x-mac-japanese // x-mac-chinesetrad // x-mac-korean // x-mac-arabic // x-mac-hebrew // x-mac-greek // x-mac-cyrillic // x-mac-chinesesimp // x-mac-romanian // x-mac-ukrainian // x-mac-thai // x-mac-ce // x-mac-icelandic // x-mac-turkish // x-mac-croatian // asmo-708 // dos-720 // dos-862 // ibm01140 // ibm01141 // ibm01142 // ibm01143 // ibm01144 // ibm01145 // ibm01146 // ibm01147 // ibm01148 // ibm01149 // ibm037 // ibm437 // ibm500 // ibm737 // ibm775 // ibm850 // ibm852 // ibm855 // ibm857 // ibm00858 // ibm860 // ibm861 // ibm863 // ibm864 // ibm865 // cp866 // ibm869 // ibm870 // cp875 // koi8-r // koi8-u void get_FromCharset(CkString &str); // Tells the charset converter the charset of the input data for a conversion. // Possible values are:2 face=MS Sans Serif> // us-ascii // unicode (also known as UTF16LE or simply UTF16) // unicodefffe (also known as UTF16BE) // ebcdic // iso-8859-1 // iso-8859-2 // iso-8859-3 // iso-8859-4 // iso-8859-5 // iso-8859-6 // iso-8859-7 // iso-8859-8 // iso-8859-9 // iso-8859-13 // iso-8859-15 // windows-874 // windows-1250 // windows-1251 // windows-1252 // windows-1253 // windows-1254 // windows-1255 // windows-1256 // windows-1257 // windows-1258 // utf-7 // utf-8 // utf-32 // utf-32be // shift_jis // gb2312 // ks_c_5601-1987 // big5 // iso-2022-jp // iso-2022-kr // euc-jp // euc-kr // macintosh // x-mac-japanese // x-mac-chinesetrad // x-mac-korean // x-mac-arabic // x-mac-hebrew // x-mac-greek // x-mac-cyrillic // x-mac-chinesesimp // x-mac-romanian // x-mac-ukrainian // x-mac-thai // x-mac-ce // x-mac-icelandic // x-mac-turkish // x-mac-croatian // asmo-708 // dos-720 // dos-862 // ibm01140 // ibm01141 // ibm01142 // ibm01143 // ibm01144 // ibm01145 // ibm01146 // ibm01147 // ibm01148 // ibm01149 // ibm037 // ibm437 // ibm500 // ibm737 // ibm775 // ibm850 // ibm852 // ibm855 // ibm857 // ibm00858 // ibm860 // ibm861 // ibm863 // ibm864 // ibm865 // cp866 // ibm869 // ibm870 // cp875 // koi8-r // koi8-u const wchar_t *fromCharset(void); // Tells the charset converter the charset of the input data for a conversion. // Possible values are:2 face=MS Sans Serif> // us-ascii // unicode (also known as UTF16LE or simply UTF16) // unicodefffe (also known as UTF16BE) // ebcdic // iso-8859-1 // iso-8859-2 // iso-8859-3 // iso-8859-4 // iso-8859-5 // iso-8859-6 // iso-8859-7 // iso-8859-8 // iso-8859-9 // iso-8859-13 // iso-8859-15 // windows-874 // windows-1250 // windows-1251 // windows-1252 // windows-1253 // windows-1254 // windows-1255 // windows-1256 // windows-1257 // windows-1258 // utf-7 // utf-8 // utf-32 // utf-32be // shift_jis // gb2312 // ks_c_5601-1987 // big5 // iso-2022-jp // iso-2022-kr // euc-jp // euc-kr // macintosh // x-mac-japanese // x-mac-chinesetrad // x-mac-korean // x-mac-arabic // x-mac-hebrew // x-mac-greek // x-mac-cyrillic // x-mac-chinesesimp // x-mac-romanian // x-mac-ukrainian // x-mac-thai // x-mac-ce // x-mac-icelandic // x-mac-turkish // x-mac-croatian // asmo-708 // dos-720 // dos-862 // ibm01140 // ibm01141 // ibm01142 // ibm01143 // ibm01144 // ibm01145 // ibm01146 // ibm01147 // ibm01148 // ibm01149 // ibm037 // ibm437 // ibm500 // ibm737 // ibm775 // ibm850 // ibm852 // ibm855 // ibm857 // ibm00858 // ibm860 // ibm861 // ibm863 // ibm864 // ibm865 // cp866 // ibm869 // ibm870 // cp875 // koi8-r // koi8-u void put_FromCharset(const wchar_t *newVal); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last input data in a // hexidecimalized string. void get_LastInputAsHex(CkString &str); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last input data in a // hexidecimalized string. const wchar_t *lastInputAsHex(void); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last input data in a // quoted-printable string. void get_LastInputAsQP(CkString &str); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last input data in a // quoted-printable string. const wchar_t *lastInputAsQP(void); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last output data in a // hexidecimalized string. void get_LastOutputAsHex(CkString &str); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last output data in a // hexidecimalized string. const wchar_t *lastOutputAsHex(void); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last output data in a // quoted-printable string. void get_LastOutputAsQP(CkString &str); // If SaveLast is set to true, then the input and output of a conversion is saved // to allow the exact bytes that are sent to the converter to be seen (for // debugging purposes). This property shows the last output data in a // quoted-printable string. const wchar_t *lastOutputAsQP(void); // Tells the component to keep the input/output byte data in memory after a // conversion is complete so the data can be examined via the LastInputAsHex/QP and // LastOutputAsHex/QP properties. (for debugging purposes) bool get_SaveLast(void); // Tells the component to keep the input/output byte data in memory after a // conversion is complete so the data can be examined via the LastInputAsHex/QP and // LastOutputAsHex/QP properties. (for debugging purposes) void put_SaveLast(bool newVal); // Tells the charset converter the target charset for a conversion. Possible values // are:2 face=MS Sans Serif> // us-ascii // unicode (also known as UTF16LE or simply UTF16) // unicodefffe (also known as UTF16BE) // ebcdic // iso-8859-1 // iso-8859-2 // iso-8859-3 // iso-8859-4 // iso-8859-5 // iso-8859-6 // iso-8859-7 // iso-8859-8 // iso-8859-9 // iso-8859-13 // iso-8859-15 // windows-874 // windows-1250 // windows-1251 // windows-1252 // windows-1253 // windows-1254 // windows-1255 // windows-1256 // windows-1257 // windows-1258 // utf-7 // utf-8 // utf-32 // utf-32be // shift_jis // gb2312 // ks_c_5601-1987 // big5 // iso-2022-jp // iso-2022-kr // euc-jp // euc-kr // macintosh // x-mac-japanese // x-mac-chinesetrad // x-mac-korean // x-mac-arabic // x-mac-hebrew // x-mac-greek // x-mac-cyrillic // x-mac-chinesesimp // x-mac-romanian // x-mac-ukrainian // x-mac-thai // x-mac-ce // x-mac-icelandic // x-mac-turkish // x-mac-croatian // asmo-708 // dos-720 // dos-862 // ibm01140 // ibm01141 // ibm01142 // ibm01143 // ibm01144 // ibm01145 // ibm01146 // ibm01147 // ibm01148 // ibm01149 // ibm037 // ibm437 // ibm500 // ibm737 // ibm775 // ibm850 // ibm852 // ibm855 // ibm857 // ibm00858 // ibm860 // ibm861 // ibm863 // ibm864 // ibm865 // cp866 // ibm869 // ibm870 // cp875 // koi8-r // koi8-u void get_ToCharset(CkString &str); // Tells the charset converter the target charset for a conversion. Possible values // are:2 face=MS Sans Serif> // us-ascii // unicode (also known as UTF16LE or simply UTF16) // unicodefffe (also known as UTF16BE) // ebcdic // iso-8859-1 // iso-8859-2 // iso-8859-3 // iso-8859-4 // iso-8859-5 // iso-8859-6 // iso-8859-7 // iso-8859-8 // iso-8859-9 // iso-8859-13 // iso-8859-15 // windows-874 // windows-1250 // windows-1251 // windows-1252 // windows-1253 // windows-1254 // windows-1255 // windows-1256 // windows-1257 // windows-1258 // utf-7 // utf-8 // utf-32 // utf-32be // shift_jis // gb2312 // ks_c_5601-1987 // big5 // iso-2022-jp // iso-2022-kr // euc-jp // euc-kr // macintosh // x-mac-japanese // x-mac-chinesetrad // x-mac-korean // x-mac-arabic // x-mac-hebrew // x-mac-greek // x-mac-cyrillic // x-mac-chinesesimp // x-mac-romanian // x-mac-ukrainian // x-mac-thai // x-mac-ce // x-mac-icelandic // x-mac-turkish // x-mac-croatian // asmo-708 // dos-720 // dos-862 // ibm01140 // ibm01141 // ibm01142 // ibm01143 // ibm01144 // ibm01145 // ibm01146 // ibm01147 // ibm01148 // ibm01149 // ibm037 // ibm437 // ibm500 // ibm737 // ibm775 // ibm850 // ibm852 // ibm855 // ibm857 // ibm00858 // ibm860 // ibm861 // ibm863 // ibm864 // ibm865 // cp866 // ibm869 // ibm870 // cp875 // koi8-r // koi8-u const wchar_t *toCharset(void); // Tells the charset converter the target charset for a conversion. Possible values // are:2 face=MS Sans Serif> // us-ascii // unicode (also known as UTF16LE or simply UTF16) // unicodefffe (also known as UTF16BE) // ebcdic // iso-8859-1 // iso-8859-2 // iso-8859-3 // iso-8859-4 // iso-8859-5 // iso-8859-6 // iso-8859-7 // iso-8859-8 // iso-8859-9 // iso-8859-13 // iso-8859-15 // windows-874 // windows-1250 // windows-1251 // windows-1252 // windows-1253 // windows-1254 // windows-1255 // windows-1256 // windows-1257 // windows-1258 // utf-7 // utf-8 // utf-32 // utf-32be // shift_jis // gb2312 // ks_c_5601-1987 // big5 // iso-2022-jp // iso-2022-kr // euc-jp // euc-kr // macintosh // x-mac-japanese // x-mac-chinesetrad // x-mac-korean // x-mac-arabic // x-mac-hebrew // x-mac-greek // x-mac-cyrillic // x-mac-chinesesimp // x-mac-romanian // x-mac-ukrainian // x-mac-thai // x-mac-ce // x-mac-icelandic // x-mac-turkish // x-mac-croatian // asmo-708 // dos-720 // dos-862 // ibm01140 // ibm01141 // ibm01142 // ibm01143 // ibm01144 // ibm01145 // ibm01146 // ibm01147 // ibm01148 // ibm01149 // ibm037 // ibm437 // ibm500 // ibm737 // ibm775 // ibm850 // ibm852 // ibm855 // ibm857 // ibm00858 // ibm860 // ibm861 // ibm863 // ibm864 // ibm865 // cp866 // ibm869 // ibm870 // cp875 // koi8-r // koi8-u void put_ToCharset(const wchar_t *newVal); // ---------------------- // Methods // ---------------------- // Converts a character set name, such as iso-8859-1, to its corresponding code // page number, like 28591. int CharsetToCodePage(const wchar_t *charsetName); // Converts a code page number to a charset name. For example, 65001 converts to // utf-8 . bool CodePageToCharset(int codePage, CkString &outCharset); // Converts a code page number to a charset name. For example, 65001 converts to // utf-8 . const wchar_t *codePageToCharset(int codePage); // Converts character data from one charset to another. Before calling ConvertData, // the FromCharset and ToCharset properties must be set to the source and // destination charset names, such as iso-8859-1 or Shift_JIS . bool ConvertData(CkByteData &inData, CkByteData &outData); // Converts a file from one character encoding to another. The FromCharset and // ToCharset properties specify the source and destination character encodings. If // the ToCharset is utf-16 or utf-8, then the preamble (also known as BOM) is // included in the output. (Call ConvertFileNoPreamble to suppress the output of // the BOM.) bool ConvertFile(const wchar_t *inPath, const wchar_t *destPath); // Converts a file from one character encoding to another. The FromCharset and // ToCharset properties specify the source and destination character encodings. No // preamble (also known as BOM) is included in the output. bool ConvertFileNoPreamble(const wchar_t *inPath, const wchar_t *destPath); // Converts Unicode (utf-16) text to the charset specified by the ToCharset // property. bool ConvertFromUnicode(const wchar_t *inData, CkByteData &outBytes); // Converts utf-16 text to the charset specified by the ToCharset property. bool ConvertFromUtf16(CkByteData &uniData, CkByteData &outMbData); // Converts HTML text from one character encoding to another. The FromCharset and // ToCharset properties must be set prior to calling this method. This method // automatically edits the META tag within the HTML that indicates the charset. bool ConvertHtml(CkByteData &inData, CkByteData &outHtml); // Converts an HTML file from one character encoding to another. The ToCharset // properties must be set prior to calling this method. If the FromCharset is not // set, it is obtained from the HTML META tag that indicates the charset. This // method automatically edits the META tag within the HTML that indicates the // charset. bool ConvertHtmlFile(const wchar_t *inPath, const wchar_t *destPath); // Converts multibyte character data to a Unicode string. The FromCharset property // should be set before calling this method. bool ConvertToUnicode(CkByteData &inData, CkString &outStr); // Converts multibyte character data to a Unicode string. The FromCharset property // should be set before calling this method. const wchar_t *convertToUnicode(CkByteData &inData); // Converts the mbData to utf-16 bytes. bool ConvertToUtf16(CkByteData &mbData, CkByteData &outUniData); // Converts non-US-ASCII characters to Unicode decimal entities (_AMP_#xxxxx;) bool EntityEncodeDec(const wchar_t *str, CkString &outStr); // Converts non-US-ASCII characters to Unicode decimal entities (_AMP_#xxxxx;) const wchar_t *entityEncodeDec(const wchar_t *str); // Converts non-US-ASCII characters to Unicode hex entities (_AMP_#xXXXX;) bool EntityEncodeHex(const wchar_t *str, CkString &outStr); // Converts non-US-ASCII characters to Unicode hex entities (_AMP_#xXXXX;) const wchar_t *entityEncodeHex(const wchar_t *str); // Examines HTML text and extracts the charset name specified by the META tag, if // present. bool GetHtmlCharset(CkByteData &inData, CkString &outCharset); // Examines HTML text and extracts the charset name specified by the META tag, if // present. const wchar_t *getHtmlCharset(CkByteData &inData); // Examines HTML text and extracts the charset name specified by the META tag, if // present. const wchar_t *htmlCharset(CkByteData &inData); // Examines an HTML file and extracts the charset name specified by the META tag, // if present. bool GetHtmlFileCharset(const wchar_t *htmlFilePath, CkString &outCharset); // Examines an HTML file and extracts the charset name specified by the META tag, // if present. const wchar_t *getHtmlFileCharset(const wchar_t *htmlFilePath); // Examines an HTML file and extracts the charset name specified by the META tag, // if present. const wchar_t *htmlFileCharset(const wchar_t *htmlFilePath); // Converts HTML entities to Unicode characters. bool HtmlDecodeToStr(const wchar_t *inStr, CkString &outStr); // Converts HTML entities to Unicode characters. const wchar_t *htmlDecodeToStr(const wchar_t *inStr); // Decodes HTML entities. See http://www.w3.org/TR/REC-html40/sgml/entities.html // for information on HTML entities. Examples of HTML entities are _AMP_lt; , // _AMP_#229; , _AMP_#xE5; , _AMP_#x6C34; , _AMP_Iacute; , etc. bool HtmlEntityDecode(CkByteData &inHtml, CkByteData &outData); // Decodes HTML entities in a file and creates a new HTML file with the entities // decoded. See http://www.w3.org/TR/REC-html40/sgml/entities.html for information // on HTML entities. Examples of HTML entities are _AMP_lt; , _AMP_#229; , // _AMP_#xE5; , _AMP_#x6C34; , _AMP_Iacute; , etc. bool HtmlEntityDecodeFile(const wchar_t *inPath, const wchar_t *destPath); // Converts a string to lowercase. bool LowerCase(const wchar_t *inStr, CkString &outStr); // Converts a string to lowercase. const wchar_t *lowerCase(const wchar_t *inStr); // Convenience method for reading the entire contents of a file into a byte array. bool ReadFile(const wchar_t *path, CkByteData &outData); // Reads a text file and returns the text converted to a Unicode string. The // filename is specified by the first method argument, and the charset of the text // data is specified by the 2nd method argument. bool ReadFileToString(const wchar_t *path, const wchar_t *charset, CkString &outStr); // Reads a text file and returns the text converted to a Unicode string. The // filename is specified by the first method argument, and the charset of the text // data is specified by the 2nd method argument. const wchar_t *readFileToString(const wchar_t *path, const wchar_t *charset); // If the ErrorAction property is set to 1, the string passed to this method is // used as the result for any characters that cannot be converted during a // conversion. void SetErrorString(const wchar_t *str, const wchar_t *charset); // Converts a string to uppercase. bool UpperCase(const wchar_t *inStr, CkString &outStr); // Converts a string to uppercase. const wchar_t *upperCase(const wchar_t *inStr); // URL decodes a string. // // This method assumes the ANSI byte representation was used for encoding. For // example, the letter É, where the ANSI charset is Latin-1, would appear as %C9 // when URL encoded. (As opposed to the utf-8 encoding where É is represented by 2 // bytes 0xC3, 0x89 and would appear as %C3%89 when URl encoded. This method // assumes ANSI encoding. // // You can use encoding/decoding methods in Chilkat's StringBuilder class to // specify the charset from which to decode or encode. // bool UrlDecodeStr(const wchar_t *inStr, CkString &outStr); // URL decodes a string. // // This method assumes the ANSI byte representation was used for encoding. For // example, the letter É, where the ANSI charset is Latin-1, would appear as %C9 // when URL encoded. (As opposed to the utf-8 encoding where É is represented by 2 // bytes 0xC3, 0x89 and would appear as %C3%89 when URl encoded. This method // assumes ANSI encoding. // // You can use encoding/decoding methods in Chilkat's StringBuilder class to // specify the charset from which to decode or encode. // const wchar_t *urlDecodeStr(const wchar_t *inStr); // Returns true if the byte data conforms to the charset passed in the first // argument. bool VerifyData(const wchar_t *charset, CkByteData &inData); // Returns true if the file contains character data that conforms to the charset // passed in the 1st argument. bool VerifyFile(const wchar_t *charset, const wchar_t *path); // Convenience method for saving an entire byte array to a file. bool WriteFile(const wchar_t *path, CkByteData &byteData); // Converts a Unicode string to a multibyte charset and writes the multibyte text // data to a file. The destination charset is specified in the 2nd method argument. bool WriteStringToFile(const wchar_t *textData, const wchar_t *path, const wchar_t *charset); // END PUBLIC INTERFACE }; #if !defined(__sun__) && !defined(__sun) #pragma pack (pop) #endif #endif