C++ Utilities  4.17.0
Useful C++ classes and routines such as argument parser, IO and conversion utilities
stringconversion.cpp
Go to the documentation of this file.
1 #include "./stringconversion.h"
2 
3 #ifndef CPP_UTILITIES_NO_THREAD_LOCAL
4 #include "resources/features.h"
5 #else
6 #define CPP_UTILITIES_THREAD_LOCAL
7 #endif
8 
9 #include <cstdlib>
10 #include <iomanip>
11 #include <limits>
12 #include <memory>
13 #include <sstream>
14 
15 #include <errno.h>
16 #include <iconv.h>
17 
18 #ifdef PLATFORM_WINDOWS
19 #include <windows.h>
20 #endif
21 
22 using namespace std;
23 
31 namespace ConversionUtilities {
32 
34 
/*!
 * \brief Output size hint which assumes the converted data needs exactly as many bytes as the input.
 */
struct Keep {
    std::size_t operator()(std::size_t inputSize)
    {
        return inputSize;
    }
};
/*!
 * \brief Output size hint which assumes the converted data needs twice as many bytes as the input.
 */
struct Double {
    std::size_t operator()(std::size_t inputSize)
    {
        // unsigned arithmetic: wraps exactly like adding the value to itself
        return inputSize * 2;
    }
};
/*!
 * \brief Output size hint which assumes the converted data needs half as many bytes as the input
 *        (rounded down).
 */
struct Half {
    std::size_t operator()(std::size_t inputSize)
    {
        return inputSize >> 1;
    }
};
/*!
 * \brief Output size hint which scales the input size by a caller-supplied factor.
 * \remarks The constructor is intentionally implicit so a plain float can be passed where a
 *          Factor is expected.
 */
struct Factor {
    Factor(float factor)
        : factor(factor)
    {
    }

    std::size_t operator()(std::size_t inputSize)
    {
        // the multiplication is carried out in float; the result is truncated towards zero
        return static_cast<std::size_t>(static_cast<float>(inputSize) * factor);
    }

    float factor;
};
62 
63 template <class OutputSizeHint> class ConversionDescriptor {
64 public:
65  ConversionDescriptor(const char *fromCharset, const char *toCharset)
66  : m_ptr(iconv_open(toCharset, fromCharset))
67  , m_outputSizeHint(OutputSizeHint())
68  {
69  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
70  throw ConversionException("Unable to allocate descriptor for character set conversion.");
71  }
72  }
73 
74  ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint)
75  : m_ptr(iconv_open(toCharset, fromCharset))
76  , m_outputSizeHint(outputSizeHint)
77  {
78  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
79  throw ConversionException("Unable to allocate descriptor for character set conversion.");
80  }
81  }
82 
83  ~ConversionDescriptor()
84  {
85  iconv_close(m_ptr);
86  }
87 
88 public:
89  StringData convertString(const char *inputBuffer, size_t inputBufferSize)
90  {
91  // setup input and output buffer
92  size_t inputBytesLeft = inputBufferSize;
93  size_t outputSize = m_outputSizeHint(inputBufferSize);
94  size_t outputBytesLeft = outputSize;
95  char *outputBuffer = reinterpret_cast<char *>(malloc(outputSize));
96  size_t bytesWritten;
97 
98  char *currentOutputOffset = outputBuffer;
99  for (;; currentOutputOffset = outputBuffer + bytesWritten) {
100  bytesWritten = iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &currentOutputOffset, &outputBytesLeft);
101  if (bytesWritten == static_cast<size_t>(-1)) {
102  if (errno == EINVAL) {
103  // ignore incomplete multibyte sequence in the input
104  bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
105  break;
106  } else if (errno == E2BIG) {
107  // output buffer has no more room for next converted character
108  bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
109  outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten;
110  outputBuffer = reinterpret_cast<char *>(realloc(outputBuffer, outputSize));
111  } else /*if(errno == EILSEQ)*/ {
112  // invalid multibyte sequence in the input
113  free(outputBuffer);
114  throw ConversionException("Invalid multibyte sequence in the input.");
115  }
116  } else {
117  // conversion completed without (further) errors
118  break;
119  }
120  }
121  return StringData(std::unique_ptr<char[], StringDataDeleter>(outputBuffer), currentOutputOffset - outputBuffer);
122  }
123 
124 private:
125  iconv_t m_ptr;
126  OutputSizeHint m_outputSizeHint;
127 };
128 
130 
141  const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
142 {
143  return ConversionDescriptor<Factor>(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize);
144 }
145 
149 StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
150 {
151  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16LE");
152  return descriptor.convertString(inputBuffer, inputBufferSize);
153 }
154 
158 StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
159 {
160  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16LE", "UTF-8");
161  return descriptor.convertString(inputBuffer, inputBufferSize);
162 }
163 
167 StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
168 {
169  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16BE");
170  return descriptor.convertString(inputBuffer, inputBufferSize);
171 }
172 
176 StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
177 {
178  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16BE", "UTF-8");
179  return descriptor.convertString(inputBuffer, inputBufferSize);
180 }
181 
185 StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
186 {
187  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("ISO-8859-1", "UTF-8");
188  return descriptor.convertString(inputBuffer, inputBufferSize);
189 }
190 
194 StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
195 {
196  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("UTF-8", "ISO-8859-1");
197  return descriptor.convertString(inputBuffer, inputBufferSize);
198 }
199 
200 #ifdef PLATFORM_WINDOWS
201 
207 WideStringData convertMultiByteToWide(const char *inputBuffer, int inputBufferSize)
208 {
209  // calculate required size
210  WideStringData widePath;
211  widePath.second = MultiByteToWideChar(CP_UTF8, 0, inputBuffer, inputBufferSize, nullptr, 0);
212  if (widePath.second <= 0) {
213  return widePath;
214  }
215  // do the actual conversion
216  widePath.first = make_unique<wchar_t[]>(static_cast<size_t>(widePath.second));
217  widePath.second = MultiByteToWideChar(CP_UTF8, 0, inputBuffer, inputBufferSize, widePath.first.get(), widePath.second);
218  if (widePath.second <= 0) {
219  widePath.first.reset();
220  }
221  return widePath;
222 }
223 
228 WideStringData convertMultiByteToWide(const std::string &inputBuffer)
229 {
230  return convertMultiByteToWide(
231  inputBuffer.data(), inputBuffer.size() < (numeric_limits<int>::max() - 1) ? static_cast<int>(inputBuffer.size() + 1) : -1);
232 }
233 #endif
234 
/*!
 * \brief Truncates all characters after the first occurrence of the specified \a terminationChar
 *        and the termination character as well.
 * \remarks \a str is left untouched if it does not contain \a terminationChar.
 */
void truncateString(std::string &str, char terminationChar)
{
    const std::string::size_type terminationPos = str.find(terminationChar);
    if (terminationPos == std::string::npos) {
        return;
    }
    str.erase(terminationPos);
}
246 
/*!
 * \brief Converts the specified data size in byte to its equivalent std::string representation.
 * \param sizeInByte Size to be formatted.
 * \param includeByte Whether the exact byte count is appended in parentheses whenever a unit
 *        larger than bytes is used.
 * \returns E.g. "512 bytes", "1.50 KiB" or "1.00 KiB (1024 byte)". Scaled values are printed
 *          with two decimal places.
 */
std::string dataSizeToString(std::uint64_t sizeInByte, bool includeByte)
{
    constexpr std::uint64_t kibibyte = 1024;
    constexpr std::uint64_t mebibyte = kibibyte * 1024;
    constexpr std::uint64_t gibibyte = mebibyte * 1024;
    constexpr std::uint64_t tebibyte = gibibyte * 1024;

    std::ostringstream res;
    res.setf(std::ios::fixed, std::ios::floatfield);
    res << std::setprecision(2);
    if (sizeInByte < kibibyte) {
        res << sizeInByte << " bytes";
    } else if (sizeInByte < mebibyte) {
        res << (static_cast<double>(sizeInByte) / 1024.0) << " KiB";
    } else if (sizeInByte < gibibyte) {
        res << (static_cast<double>(sizeInByte) / 1048576.0) << " MiB";
    } else if (sizeInByte < tebibyte) {
        res << (static_cast<double>(sizeInByte) / 1073741824.0) << " GiB";
    } else {
        res << (static_cast<double>(sizeInByte) / 1099511627776.0) << " TiB";
    }
    // use the same boundary as the unit selection above so that exactly 1024 bytes
    // ("1.00 KiB") gets the byte count appended as well
    if (includeByte && sizeInByte >= kibibyte) {
        res << ' ' << '(' << sizeInByte << " byte)";
    }
    return res.str();
}
273 
/*!
 * \brief Converts the specified bitrate in kbit/s to its equivalent std::string representation.
 * \param bitrateInKbitsPerSecond Bitrate in kbit/s.
 * \param useIecBinaryPrefixes Whether the result is expressed in byte/s, KiB/s, MiB/s or GiB/s
 *        instead of bit/s, kbit/s, Mbit/s or Gbit/s.
 * \remarks
 * - The number is printed with a precision of three significant digits.
 * - NOTE(review): the byte-based units carry IEC labels but are derived with factors of 1000
 *   (8 kbit/s is printed as "1 KiB/s") - confirm whether that is intended.
 */
std::string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes)
{
    // a unit applies to all bitrates below its upper limit; the bitrate is multiplied by the factor
    struct Unit {
        double upperLimit;
        double factor;
        const char *suffix;
    };
    static constexpr Unit byteUnits[] = {
        { 8.0, 125.0, " byte/s" },
        { 8000.0, 0.125, " KiB/s" },
        { 8000000.0, 0.000125, " MiB/s" },
    };
    static constexpr Unit bitUnits[] = {
        { 1.0, 1000.0, " bit/s" },
        { 1000.0, 1.0, " kbit/s" },
        { 1000000.0, 0.001, " Mbit/s" },
    };

    // start with the largest unit; it is used if none of the limits above applies
    double factor = useIecBinaryPrefixes ? 0.000000125 : 0.000001;
    const char *suffix = useIecBinaryPrefixes ? " GiB/s" : " Gbit/s";
    for (const Unit &unit : (useIecBinaryPrefixes ? byteUnits : bitUnits)) {
        if (bitrateInKbitsPerSecond < unit.upperLimit) {
            factor = unit.factor;
            suffix = unit.suffix;
            break;
        }
    }

    std::ostringstream res;
    res << std::setprecision(3) << (bitrateInKbitsPerSecond * factor) << suffix;
    return res.str();
}
311 
/// Base64 alphabet; the index of a character is the 6-bit value it represents.
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/// Character used to fill up the last Base64 quantum.
const char base64Pad = '=';

/*!
 * \brief Encodes the specified data to Base64.
 * \param data Bytes to encode; may be nullptr if \a dataSize is zero.
 * \param dataSize Number of bytes to encode.
 * \returns The encoded string; its length is always a multiple of four ('=' is used for padding).
 */
std::string encodeBase64(const std::uint8_t *data, std::uint32_t dataSize)
{
    const std::uint32_t remainder = dataSize % 3;
    std::string encoded;
    // four characters per (started) group of three bytes; computed in size_t to avoid 32-bit wrap-around
    encoded.reserve((static_cast<std::size_t>(dataSize / 3) + (remainder ? 1u : 0u)) * 4u);

    // complete groups of three bytes
    for (const std::uint8_t *const groupsEnd = data + (dataSize - remainder); data != groupsEnd; data += 3) {
        const std::uint32_t group
            = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8) | static_cast<std::uint32_t>(data[2]);
        encoded.push_back(base64Chars[(group >> 18) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 12) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 6) & 0x3F]);
        encoded.push_back(base64Chars[group & 0x3F]);
    }

    // remaining one or two bytes, filled up with padding
    switch (remainder) {
    case 1: {
        const std::uint32_t group = static_cast<std::uint32_t>(data[0]) << 16;
        encoded.push_back(base64Chars[(group >> 18) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 12) & 0x3F]);
        encoded.push_back(base64Pad);
        encoded.push_back(base64Pad);
        break;
    }
    case 2: {
        const std::uint32_t group = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8);
        encoded.push_back(base64Chars[(group >> 18) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 12) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 6) & 0x3F]);
        encoded.push_back(base64Pad);
        break;
    }
    default:
        break;
    }
    return encoded;
}
355 
361 pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr, const uint32 strSize)
362 {
363  if (strSize % 4) {
364  throw ConversionException("invalid size of base64");
365  }
366  uint32 decodedSize = (strSize / 4) * 3;
367  const char *const end = encodedStr + strSize;
368  if (strSize) {
369  if (*(end - 1) == base64Pad) {
370  --decodedSize;
371  }
372  if (*(end - 2) == base64Pad) {
373  --decodedSize;
374  }
375  }
376  auto buffer = make_unique<byte[]>(decodedSize);
377  auto *iter = buffer.get() - 1;
378  while (encodedStr < end) {
379  int32 temp = 0;
380  for (byte quantumPos = 0; quantumPos < 4; ++quantumPos, ++encodedStr) {
381  temp <<= 6;
382  if (*encodedStr >= 'A' && *encodedStr <= 'Z') {
383  temp |= *encodedStr - 'A';
384  } else if (*encodedStr >= 'a' && *encodedStr <= 'z') {
385  temp |= *encodedStr - 'a' + 26;
386  } else if (*encodedStr >= '0' && *encodedStr <= '9') {
387  temp |= *encodedStr - '0' + 2 * 26;
388  } else if (*encodedStr == '+') {
389  temp |= 2 * 26 + 10;
390  } else if (*encodedStr == '/') {
391  temp |= 2 * 26 + 10 + 1;
392  } else if (*encodedStr == base64Pad) {
393  switch (end - encodedStr) {
394  case 1:
395  *++iter = (temp >> 16) & 0xFF;
396  *++iter = (temp >> 8) & 0xFF;
397  return make_pair(move(buffer), decodedSize);
398  case 2:
399  *++iter = (temp >> 10) & 0xFF;
400  return make_pair(move(buffer), decodedSize);
401  default:
402  throw ConversionException("invalid padding in base64");
403  }
404  } else {
405  throw ConversionException("invalid character in base64");
406  }
407  }
408  *++iter = (temp >> 16) & 0xFF;
409  *++iter = (temp >> 8) & 0xFF;
410  *++iter = (temp)&0xFF;
411  }
412  return make_pair(move(buffer), decodedSize);
413 }
414 } // namespace ConversionUtilities
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (little-endian).
CPP_UTILITIES_EXPORT std::string encodeBase64(const byte *data, uint32 dataSize)
Encodes the specified data to Base64.
std::pair< std::unique_ptr< char[], StringDataDeleter >, std::size_t > StringData
Type used to return string encoding conversion result.
The ConversionException class is thrown by the various conversion functions of this library when a conversion error occurs.
CPP_UTILITIES_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (big-endian) string to UTF-8.
CPP_UTILITIES_EXPORT void truncateString(std::string &str, char terminationChar='\0')
Truncates all characters after the first occurrence of the specified terminationChar and the termination character as well.
std::uint64_t uint64
unsigned 64-bit integer
Definition: types.h:49
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (big-endian).
constexpr T max(T first, T second)
Returns the greatest of the given items.
Definition: math.h:29
CPP_UTILITIES_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to Latin-1.
Contains several functions providing conversions between different data types.
std::uint32_t uint32
unsigned 32-bit integer
Definition: types.h:44
CPP_UTILITIES_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (little-endian) string to UTF-8.
std::int32_t int32
signed 32-bit integer
Definition: types.h:24
CPP_UTILITIES_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor=1.0f)
Converts the specified string from one character set to another.
CPP_UTILITIES_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified Latin-1 string to UTF-8.
CPP_UTILITIES_EXPORT std::string dataSizeToString(uint64 sizeInByte, bool includeByte=false)
Converts the specified data size in byte to its equivalent std::string representation.
CPP_UTILITIES_EXPORT std::pair< std::unique_ptr< byte[]>, uint32 > decodeBase64(const char *encodedStr, const uint32 strSize)
Decodes the specified Base64 encoded string.
CPP_UTILITIES_EXPORT std::string bitrateToString(double speedInKbitsPerSecond, bool useByteInsteadOfBits=false)
Converts the specified bitrate in kbit/s to its equivalent std::string representation.