CBMC
unescape_string.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module: ANSI-C Language Conversion
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
11 
12 #include "unescape_string.h"
13 
14 #include <cctype>
15 
16 #include <util/invariant.h>
17 #include <util/unicode.h>
18 
20  unsigned int value,
21  std::string &dest)
22 {
23  std::basic_string<unsigned int> value_str(1, value);
24 
25  // turn into utf-8
26  const std::string utf8_value = utf32_native_endian_to_utf8(value_str);
27 
28  dest.append(utf8_value);
29 }
30 
32  unsigned int value,
33  std::basic_string<unsigned int> &dest)
34 {
35  dest.push_back(value);
36 }
37 
38 template<typename T>
39 std::basic_string<T> unescape_string_templ(const std::string &src)
40 {
41  std::basic_string<T> dest;
42 
43  dest.reserve(src.size()); // about that long, but may be shorter
44 
45  for(unsigned i=0; i<src.size(); i++)
46  {
47  T ch=(unsigned char)src[i];
48 
49  if(ch=='\\') // escape?
50  {
51  // go to next character
52  i++;
53  INVARIANT(i < src.size(), "backslash can't be last character");
54 
55  ch=(unsigned char)src[i];
56  switch(ch)
57  {
58  case '\\': dest.push_back(ch); break;
59  case 'n': dest.push_back('\n'); break; /* NL (0x0a) */
60  case 't': dest.push_back('\t'); break; /* HT (0x09) */
61  case 'v': dest.push_back('\v'); break; /* VT (0x0b) */
62  case 'b': dest.push_back('\b'); break; /* BS (0x08) */
63  case 'r': dest.push_back('\r'); break; /* CR (0x0d) */
64  case 'f': dest.push_back('\f'); break; /* FF (0x0c) */
65  case 'a': dest.push_back('\a'); break; /* BEL (0x07) */
66  case '"': dest.push_back('"'); break;
67  case '\'': dest.push_back('\''); break;
68 
69  case 'u': // universal character
70  case 'U': // universal character
71  i++;
72 
73  {
74  std::string hex;
75 
76  const unsigned digits = (ch == 'u') ? 4u : 8u;
77  hex.reserve(digits);
78 
79  for(unsigned count=digits;
80  count!=0 && i<src.size();
81  i++, count--)
82  hex+=src[i];
83 
84  // go back
85  i--;
86 
87  unsigned int result=hex_to_unsigned(hex.c_str(), hex.size());
88 
89  append_universal_char(result, dest);
90  }
91 
92  break;
93 
94  case 'x': // hex
95  i++;
96 
97  {
98  std::string hex;
99 
100  while(i<src.size() && isxdigit(src[i]))
101  {
102  hex+=src[i];
103  i++;
104  }
105 
106  // go back
107  i--;
108 
109  ch=hex_to_unsigned(hex.c_str(), hex.size());
110  }
111 
112  // if T isn't sufficiently wide to hold unsigned values
113  // the following might truncate; but then
114  // universal characters in non-wide strings don't
115  // really work; gcc just issues a warning.
116  dest.push_back(ch);
117  break;
118 
119  default:
120  if(isdigit(ch)) // octal
121  {
122  std::string octal;
123 
124  while(i<src.size() && isdigit(src[i]))
125  {
126  octal+=src[i];
127  i++;
128  }
129 
130  // go back
131  i--;
132 
133  ch=octal_to_unsigned(octal.c_str(), octal.size());
134  dest.push_back(ch);
135  }
136  else
137  {
138  // Unknown escape sequence.
139  // Both GCC and CL turn \% into %.
140  dest.push_back(ch);
141  }
142  }
143  }
144  else
145  dest.push_back(ch);
146  }
147 
148  return dest;
149 }
150 
151 std::string unescape_string(const std::string &src)
152 {
153  return unescape_string_templ<char>(src);
154 }
155 
156 std::basic_string<unsigned int> unescape_wide_string(
157  const std::string &src)
158 {
159  return unescape_string_templ<unsigned int>(src);
160 }
161 
/// Convert a sequence of hexadecimal digits into a number.
///
/// Conversion stops after `digits` characters or at the first NUL
/// character, whichever comes first. Every consumed character shifts the
/// result by four bits; characters that are not hexadecimal digits
/// contribute a zero nibble. Bits shifted beyond the width of `unsigned`
/// are lost.
/// \param hex: pointer to the first digit
/// \param digits: maximum number of characters to read from `hex`
/// \return the value denoted by the digits read
unsigned hex_to_unsigned(const char *hex, std::size_t digits)
{
  unsigned value=0;

  for(; digits!=0; digits--, hex++)
  {
    // The <cctype> functions have undefined behaviour for negative
    // arguments, so look at the character as unsigned char.
    const unsigned char ch=static_cast<unsigned char>(*hex);

    if(ch==0)
      break;

    value<<=4;

    if(std::isdigit(ch))
      value|=ch-'0';
    else if(std::isxdigit(ch))
      value|=10+std::tolower(ch)-'a';
  }

  return value;
}
183 
/// Convert a sequence of octal digits into a number.
///
/// Conversion stops after `digits` characters or at the first NUL
/// character, whichever comes first. Every consumed character shifts the
/// result by three bits; characters that are not decimal digits contribute
/// nothing further. Bits shifted beyond the width of `unsigned` are lost.
/// \param octal: pointer to the first digit
/// \param digits: maximum number of characters to read from `octal`
/// \return the value denoted by the digits read
unsigned octal_to_unsigned(const char *octal, std::size_t digits)
{
  unsigned value=0;

  for(; digits!=0; digits--, octal++)
  {
    // std::isdigit has undefined behaviour for negative arguments, so
    // look at the character as unsigned char.
    const unsigned char ch=static_cast<unsigned char>(*octal);

    if(ch==0)
      break;

    value<<=3;

    if(std::isdigit(ch))
      value|=ch-'0';
  }

  return value;
}
int isdigit(int c)
Definition: ctype.c:24
int tolower(int c)
Definition: ctype.c:109
int isxdigit(int c)
Definition: ctype.c:95
#define INVARIANT(CONDITION, REASON)
This macro uses the wrapper function 'invariant_violated_string'.
Definition: invariant.h:423
unsigned octal_to_unsigned(const char *octal, std::size_t digits)
std::string unescape_string(const std::string &src)
std::basic_string< T > unescape_string_templ(const std::string &src)
static void append_universal_char(unsigned int value, std::string &dest)
std::basic_string< unsigned int > unescape_wide_string(const std::string &src)
unsigned hex_to_unsigned(const char *hex, std::size_t digits)
ANSI-C Language Conversion.
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:137