CBMC
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1/*******************************************************************\
2
3Module:
4
5Author: Daniel Kroening, kroening@kroening.com
6
7\*******************************************************************/
8
9#include "unicode.h"
10
11#include "invariant.h"
12
13#include <codecvt>
14#include <cstdint>
15#include <iomanip>
16#include <locale>
17#include <sstream>
18
19#ifdef _WIN32
20# include <util/pragma_push.def>
21# ifdef _MSC_VER
22# pragma warning(disable : 4668)
23// using #if/#elif on undefined macro
24# pragma warning(disable : 5039)
25// pointer or reference to potentially throwing function passed to extern C
26# endif
27# include <util/pragma_pop.def>
28# include <windows.h>
29#endif
30
31static void utf8_append_code(unsigned int c, std::string &);
32
33std::string narrow(const wchar_t *s)
34{
35#ifdef _WIN32
36
37 int slength = static_cast<int>(wcslen(s));
38 int rlength =
40 std::string r(rlength, 0);
42 return r;
43
44#else
45 return narrow(std::wstring(s));
46#endif
47}
48
49std::wstring widen(const char *s)
50{
51#ifdef _WIN32
52
53 int slength = static_cast<int>(strlen(s));
55 std::wstring r(rlength, 0);
57 return r;
58
59#else
60 return widen(std::string(s));
61#endif
62}
63
64std::string narrow(const std::wstring &s)
65{
66#ifdef _WIN32
67
68 int slength = static_cast<int>(s.size());
69 int rlength =
71 std::string r(rlength, 0);
73 return r;
74
75#else
76 std::string result;
77
78 result.reserve(s.size()); // at least that long
79
80 for(const auto codepoint : s)
82
83 return result;
84#endif
85}
86
87std::wstring widen(const std::string &s)
88{
89#ifdef _WIN32
90
91 int slength = static_cast<int>(s.size());
92 int rlength = MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, NULL, 0);
93 std::wstring r(rlength, 0);
95 return r;
96
97#else
98 auto utf32 = utf8_to_utf32(std::string(s));
99
100 std::wstring r;
101 r.reserve(utf32.size());
102 for(auto codepoint : utf32)
103 r += codepoint;
104 return r;
105#endif
106}
107
/// Appends the UTF-8 encoding of a code point to a string.
/// Values up to 0x7f take one byte, up to 0x7ff two bytes, up to 0xffff
/// three bytes, and everything above that four bytes. The value is not
/// checked for being a valid Unicode code point.
/// \param c: code point to encode
/// \param result: string that the encoded bytes are appended to
static void utf8_append_code(unsigned int c, std::string &result)
{
  // Builds a continuation byte (bit pattern 10xxxxxx) from the six bits of
  // c that start at bit position `shift`.
  const auto continuation_byte = [c](unsigned int shift) {
    return static_cast<char>(((c >> shift) & 0x3f) | 0x80);
  };

  if(c <= 0x7f)
  {
    result += static_cast<char>(c);
    return;
  }

  if(c <= 0x7ff)
  {
    result += static_cast<char>((c >> 6) | 0xc0);
    result += continuation_byte(0);
    return;
  }

  if(c <= 0xffff)
  {
    result += static_cast<char>((c >> 12) | 0xe0);
    result += continuation_byte(6);
    result += continuation_byte(0);
    return;
  }

  result += static_cast<char>((c >> 18) | 0xf0);
  result += continuation_byte(12);
  result += continuation_byte(6);
  result += continuation_byte(0);
}
133
136std::string utf32_native_endian_to_utf8(const std::basic_string<char32_t> &s)
137{
138 std::string result;
139
140 result.reserve(s.size()); // at least that long
141
142 for(const auto c : s)
143 utf8_append_code(c, result);
144
145 return result;
146}
147
148std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
149{
150 if(argv_wide == nullptr)
151 return std::vector<std::string>();
152
153 std::vector<std::string> argv_narrow;
154 argv_narrow.reserve(argc);
155
156 for(int i = 0; i != argc; ++i)
157 argv_narrow.push_back(narrow(argv_wide[i]));
158
159 return argv_narrow;
160}
161
/// Appends the UTF-16 encoding of a code point to a wide string.
/// Code points up to and including 0xFFFF are written as a single code unit;
/// larger ones are written as a high/low surrogate pair.
/// \param code: code point to encode
/// \param result: wide string that the code unit(s) are appended to
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // 0xFFFF is still in the Basic Multilingual Plane and must be a single
  // code unit; subtracting 0x10000 from it below would wrap around.
  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
    return;
  }

  // code is encoded as two UTF16 characters.
  // If this is valid unicode, we have code<=0x10FFFF,
  // but let's not check it programmatically.
  const unsigned int offset = code - 0x10000;

  // The upper ten bits go into the high surrogate, the lower ten bits into
  // the low surrogate.
  const uint16_t high_surrogate =
    static_cast<uint16_t>(((offset >> 10) & 0x3ff) | 0xD800);
  const uint16_t low_surrogate =
    static_cast<uint16_t>((offset & 0x3ff) | 0xDC00);

  result += static_cast<wchar_t>(high_surrogate);
  result += static_cast<wchar_t>(low_surrogate);
}
186
191std::wstring utf8_to_utf16_native_endian(const std::string &in)
192{
193 std::wstring result;
194 result.reserve(in.size());
195
196 for(auto codepoint : utf8_to_utf32(in))
198
199 return result;
200}
201
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
///
/// Malformed input does not cause an error. A byte that cannot start a
/// sequence (a stray continuation byte, or a value above 0xF7), or a lead
/// byte that is not followed by the required number of continuation bytes
/// (bit pattern 10xxxxxx), is replaced by a single space and decoding resumes
/// at the very next byte. No check is made for overlong encodings, surrogate
/// values or values above 0x10FFFF.
/// \param utf8_str: UTF-8 encoded string
/// \return one char32_t per decoded (or replaced) character
std::u32string utf8_to_utf32(const std::string &utf8_str)
{
  std::u32string result;
  result.reserve(utf8_str.size());

  const std::string::size_type size = utf8_str.size();
  std::string::size_type i = 0;
  while(i < size)
  {
    const unsigned char lead = static_cast<unsigned char>(utf8_str[i++]);

    if(lead <= 0x7F)
    {
      // a single byte encodes exactly its own value
      result.append(1, static_cast<char32_t>(lead));
      continue;
    }

    // The most significant bits of the lead byte determine how many
    // continuation bytes follow and how many payload bits the lead carries.
    std::string::size_type trailing = 0;
    char32_t code = 0;
    if(lead >= 0xC0 && lead <= 0xDF)
    {
      trailing = 1;
      code = lead & 0x1Fu;
    }
    else if(lead >= 0xE0 && lead <= 0xEF)
    {
      trailing = 2;
      code = lead & 0xFu;
    }
    else if(lead >= 0xF0 && lead <= 0xF7)
    {
      trailing = 3;
      code = lead & 0x7u;
    }
    // otherwise trailing stays 0: stray continuation byte or value too high

    bool valid = trailing != 0 && size - i >= trailing;
    for(std::string::size_type k = 0; valid && k < trailing; ++k)
    {
      const unsigned char continuation =
        static_cast<unsigned char>(utf8_str[i + k]);
      if((continuation & 0xC0u) != 0x80u)
        valid = false;
      else
        code = (code << 6) | (continuation & 0x3Fu);
    }

    if(valid)
    {
      i += trailing;
    }
    else
    {
      // The string is not a valid UTF8 string! Replace the offending byte
      // with a space and do not consume any of the bytes that follow it.
      code = 32;
    }

    result.append(1, code);
  }

  return result;
}
263
/// Writes a single character to a stream in a form suitable for use inside a
/// Java string literal. Newline, carriage return, form feed, backspace and
/// tab become their two-character escapes; printable characters up to 255
/// are written as they are, with double quote and backslash preceded by a
/// backslash; everything else becomes a \\u escape with the value in
/// hexadecimal, zero-padded to at least four digits.
/// \param ch: character to write
/// \param result: stream that the (possibly escaped) character is written to
/// \param loc: locale used to decide whether \p ch is printable
static void utf16_native_endian_to_java_string(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", and \ need to be escaped, but not ' for java strings
    // e.g. "\"\\" needs escaping but "'" does not.
    if(uch == '"' || uch == '\\')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
309
317 const wchar_t ch,
318 std::ostringstream &result,
319 const std::locale &loc)
320{
321 if(ch == (wchar_t)'\'')
322 {
323 const auto uch = static_cast<unsigned char>(ch);
324 // ' needs to be escaped for java characters, e.g. '\''
325 result << '\\' << uch;
326 }
327 else
328 {
330 }
331}
332
335std::string utf16_native_endian_to_java(const char16_t ch)
336{
337 std::ostringstream result;
338 const std::locale loc;
339 utf16_native_endian_to_java(ch, result, loc);
340 return result.str();
341}
342
350std::string utf16_native_endian_to_java_string(const std::wstring &in)
351{
352 std::ostringstream result;
353 const std::locale loc;
354 for(const auto ch : in)
356 return result.str();
357}
358
359std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
360{
361 return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
362}
363
/// Converts a UTF-16 string to a UTF-8 encoded string using the standard
/// library's UTF-8/UTF-16 conversion facet.
/// \param utf16_str: UTF-16 string to convert
/// \return the UTF-8 encoding of \p utf16_str
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifdef _MSC_VER
  // Workaround for Visual Studio bug, see
  // https://stackoverflow.com/questions/32055357
  // The conversion goes through wchar_t instead of char16_t.
  using convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;
  std::wstring wide_string(utf16_str.begin(), utf16_str.end());
  convertert converter;
  return converter.to_bytes(wide_string);
#else
  using convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>;
  convertert converter;
  return converter.to_bytes(utf16_str);
#endif
}
377
378char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
379{
380 PRECONDITION(hex.length() == 4);
381 return std::strtol(hex.c_str(), nullptr, 16);
382}
383
ait supplies three of the four components needed: an abstract interpreter (in this case handling func...
Definition ai.h:562
int isprint(int c)
Definition ctype.c:39
static int8_t r
Definition irep_hash.h:60
#define PRECONDITION(CONDITION)
Definition invariant.h:463
size_t strlen(const char *s)
Definition string.c:561
std::string narrow(const wchar_t *s)
Definition unicode.cpp:33
std::u32string utf8_to_utf32(const std::string &utf8_str)
Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
Definition unicode.cpp:205
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double- and single-quotes and backsla...
Definition unicode.cpp:316
std::wstring widen(const char *s)
Definition unicode.cpp:49
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition unicode.cpp:162
static void utf16_native_endian_to_java_string(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double quotes and backslashes.
Definition unicode.cpp:272
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
Definition unicode.cpp:359
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
Definition unicode.cpp:378
std::string utf32_native_endian_to_utf8(const std::basic_string< char32_t > &s)
Definition unicode.cpp:136
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition unicode.cpp:148
std::string codepoint_hex_to_utf8(const std::string &hex)
Definition unicode.cpp:384
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition unicode.cpp:191
static void utf8_append_code(unsigned int c, std::string &)
Appends a unicode character to a utf8-encoded string.
Definition unicode.cpp:110