RESTinio
percent_encoding.hpp
Go to the documentation of this file.
1 /*
2  restinio
3 */
4 
9 #pragma once
10 
11 #include <string>
12 
14 
15 #include <restinio/string_view.hpp>
16 #include <restinio/exception.hpp>
17 #include <restinio/expected.hpp>
18 
20 
21 namespace restinio
22 {
23 
24 namespace utils
25 {
26 
36 {
//! Tells whether \a c can be kept as is (without percent-escaping).
/*!
 * Accepted chars: ASCII digits, ASCII letters and the marks
 * '-', '.', '~' and '_'.
 */
static constexpr bool
ordinary_char( char c ) noexcept
{
	// The punctuation marks are checked first...
	switch( c )
	{
		case '-':
		case '.':
		case '~':
		case '_':
			return true;
		default:
			break;
	}

	// ...then the three alphanumeric ranges.
	const bool is_digit = !( c < '0' || '9' < c );
	const bool is_lower = !( c < 'a' || 'z' < c );
	const bool is_upper = !( c < 'A' || 'Z' < c );

	return is_digit || is_lower || is_upper;
}
49 };
50 
61 {
//! Tells whether \a c can be kept as is (without percent-escaping).
/*!
 * Accepted chars: ASCII digits, ASCII letters and the marks
 * '*', '-', '.' and '_'.
 */
static constexpr bool
ordinary_char( char c ) noexcept
{
	if( '*' == c || '-' == c || '.' == c || '_' == c )
		return true;

	if( '0' <= c && c <= '9' )
		return true;

	if( 'a' <= c && c <= 'z' )
		return true;

	return 'A' <= c && c <= 'Z';
}
74 };
75 
97 {
//! Tells whether \a c can be kept as is in the relaxed mode.
/*!
 * Accepted chars: space, ASCII letters and digits, the unreserved marks
 * "-._~", gen-delims ":/?#[]@" and sub-delims "!$&'()*+,;=".
 *
 * The NUL char is never treated as ordinary. A lookup via std::strchr
 * would accept it, because strchr considers the terminating NUL to be
 * a part of the searched string.
 */
static constexpr bool
ordinary_char( char c ) noexcept
{
	const char * allowed =
		" " // Space
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" // ALPHA
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789" // DIGIT
		"-._~" // unreserved
		":/?#[]@" // gen-delims
		"!$&'()*+,;="; // sub-delims

	// The scan stops at the terminator, so '\0' can never match.
	for( ; '\0' != *allowed; ++allowed )
	{
		if( c == *allowed )
			return true;
	}

	return false;
}
110 };
111 
125 {
//! Tells whether \a c can be kept as is (without percent-escaping).
/*!
 * Accepted chars: ASCII digits, ASCII letters and the marks
 * '-', '.', '~', '_', '*', '!', '\'', '(' and ')'.
 */
static constexpr bool
ordinary_char( char c ) noexcept
{
	switch( c )
	{
		case '-': case '.': case '~': case '_':
		case '*': case '!': case '\'':
		case '(': case ')':
			return true;
		default:
			break;
	}

	if( '0' <= c && c <= '9' )
		return true;

	if( 'a' <= c && c <= 'z' )
		return true;

	return !( c < 'A' || 'Z' < c );
}
143 };
144 
152 
// Body of the type that indicates a failure of unescaping of
// percent-encoded symbols. The only visible state is a textual description.
160 {
     // Description of a failure.
162  std::string m_description;
163 
164 public:
     // Constructor parameter: the description is accepted by value.
166  std::string description )
168  {}
169 
     // Get a reference to the description of the failure.
172  const std::string &
173  description() const noexcept { return m_description; }
174 
176 
     // Return type of the accessor that gives the description out by value.
182  std::string
184 };
185 
186 namespace impl
187 {
188 
//! Tells whether \a c is an ASCII hexadecimal digit (0-9, a-f, A-F).
/*!
 * The check is a pure comparison, so the function is declared constexpr
 * and noexcept.
 */
constexpr inline bool
is_hexdigit( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'f' ) ||
		( 'A' <= c && c <= 'F' );
}
197 
//! Numeric value of one hexadecimal digit.
/*!
 * \attention \a c is expected to be a valid hex digit (see is_hexdigit).
 * For any other char the result is meaningless (but still well defined,
 * because all the arithmetic is unsigned).
 */
constexpr inline unsigned int
hexdigit_value( char c ) noexcept
{
	const unsigned int code = static_cast<unsigned char>(c);

	if( '0' <= c && c <= '9' )
		return code - static_cast<unsigned int>('0');

	// Setting bit 0x20 maps 'A'..'F' onto 'a'..'f'.
	return ( code | 0x20u ) - static_cast<unsigned int>('a') + 10u;
}

//! Builds one byte from two hexadecimal digits of a "%XX" sequence.
/*!
 * \param c1 the digit for the high nibble.
 * \param c2 the digit for the low nibble.
 *
 * \return the byte with the value (c1 << 4) + c2 as a char.
 *
 * \attention Both chars must be checked by is_hexdigit() beforehand.
 *
 * The value is computed in unsigned arithmetic and truncated to 8 bits
 * once, so no left shift is ever applied to a signed value.
 */
constexpr inline char
extract_escaped_char( char c1, char c2 ) noexcept
{
	const unsigned int high_nibble = hexdigit_value( c1 );
	const unsigned int low_nibble = hexdigit_value( c2 );

	return static_cast<char>( static_cast<unsigned char>(
			( ( high_nibble << 4 ) + low_nibble ) & 0xFFu ) );
}
223 
224 //
225 // do_unescape_percent_encoding
226 //
// The actual implementation of unescape-percent-encoding procedure.
//
// Goes through `data` and passes every resulting char to `collector`:
//  - "%XX" (XX are two hex digits) is decoded into one byte, and the
//    sequence of decoded bytes is validated by utf8_checker_t;
//  - '+' is passed as a space;
//  - a char accepted by Traits::ordinary_char() is passed as is;
//  - any other char is a failure.
//
// A failure is reported via make_unexpected() with a textual description.
// Chars handled before the failure have already been passed to `collector`.
232 template<
233  typename Traits,
234  typename Chars_Collector >
236 expected_t<
240  const string_view_t data,
241  Chars_Collector && collector )
242 {
243  std::size_t chars_to_handle = data.size();
244  const char * d = data.data();
245 
246  utf8_checker_t utf8_checker;
247  bool expect_next_utf8_byte = false;
248 
     // Offset of the current char from the beginning of `data`;
     // used only for error messages.
249  const auto current_pos = [&d, &data]() noexcept { return d - data.data(); };
250 
251  while( 0 < chars_to_handle )
252  {
253  char c = *d;
     // While a multi-byte UTF-8 sequence is unfinished only another
     // escape sequence may follow.
254  if( expect_next_utf8_byte && '%' != c )
255  return make_unexpected( unescape_percent_encoding_failure_t{
256  fmt::format(
257  "next byte from UTF-8 sequence expected at {}",
258  current_pos() )
259  } );
260 
261  if( '%' == c )
262  {
     // A complete escape sequence is '%' plus two hex digits.
263  if( chars_to_handle >= 3 &&
264  is_hexdigit( d[ 1 ] ) &&
265  is_hexdigit( d[ 2 ] ) )
266  {
267  const auto ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
268  if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
269  return make_unexpected( unescape_percent_encoding_failure_t{
270  fmt::format( "invalid UTF-8 sequence detected at {}",
271  current_pos() )
272  } );
273 
274  collector( ch );
275  chars_to_handle -= 3;
276  d += 3;
277 
     // If the checker is not finalized yet then more escaped bytes
     // are required; otherwise it is prepared for the next symbol.
278  expect_next_utf8_byte = !utf8_checker.finalized();
279  if( !expect_next_utf8_byte )
280  utf8_checker.reset();
281  }
282  else
283  {
284  return make_unexpected( unescape_percent_encoding_failure_t{
285  fmt::format(
286  "invalid escape sequence at pos {}", current_pos() )
287  } );
288  }
289  }
290  else if( '+' == c )
291  {
     // '+' is decoded as a space.
292  collector( ' ' );
293  --chars_to_handle;
294  ++d;
295  }
296  else if( Traits::ordinary_char( c ) )
297  {
298  collector( c );
299  --chars_to_handle;
300  ++d;
301  }
302  else
303  {
     // NOTE(review): `c` is passed to fmt as a plain char; confirm how
     // bytes >= 0x80 are rendered by {:#02X} where char is signed.
304  return make_unexpected( unescape_percent_encoding_failure_t{
305  fmt::format(
306  "invalid non-escaped char with code {:#02X} at pos: {}",
307  c,
308  current_pos() )
309  } );
310  }
311  }
312 
     // The input ended in the middle of a multi-byte UTF-8 sequence.
313  if( expect_next_utf8_byte )
314  return make_unexpected( unescape_percent_encoding_failure_t{
315  fmt::format( "unfinished UTF-8 sequence" )
316  } );
317 
319 }
320 
321 } /* namespace impl */
322 
// Percent encoding.
//
// Builds a copy of `data` in which every char rejected by
// Traits::ordinary_char() is replaced by the text produced by
// fmt::format( "%{:02X}", c ). Chars accepted by the traits are copied as is.
325 template< typename Traits = restinio_default_unescape_traits >
327 std::string
329 {
330  std::string result;
     // The first pass only counts the chars that have to be escaped.
331  const auto escaped_chars_count = static_cast<std::size_t>(
332  std::count_if(
333  data.begin(),
334  data.end(),
335  []( auto c ){ return !Traits::ordinary_char(c); } ));
336 
337  if( 0 == escaped_chars_count )
338  {
339  // No escaped chars.
340  result.assign( data.data(), data.size() );
341  }
342  else
343  {
344  // Having escaped chars.
     // Reserves two additional chars per char to be escaped
     // (an escape of the form "%XX" takes three chars instead of one).
345  result.reserve( data.size() + 2*escaped_chars_count );
346  for( auto c : data )
347  {
348  if( Traits::ordinary_char( c ) )
349  result += c;
350  else
351  {
     // NOTE(review): `c` is a plain char here; confirm how bytes >= 0x80
     // are rendered by {:02X} where char is signed.
352  result += fmt::format( "%{:02X}", c );
353  }
354  }
355  }
356 
357  return result;
358 }
359 
// Unescapes a percent-encoded value into a new std::string.
//
// The work is done by impl::do_unescape_percent_encoding; every decoded char
// is appended to the result. If a failure is reported then exception_t with
// the description of that failure is thrown.
360 template< typename Traits = restinio_default_unescape_traits >
362 std::string
364 {
365  std::string result;
     // The decoded value is never longer than the source one.
366  result.reserve( data.size() );
367 
368  auto r = impl::do_unescape_percent_encoding<Traits>(
369  data,
370  [&result]( char ch ) { result += ch; } );
371  if( !r )
372  throw exception_t{ r.error().giveout_description() };
373 
374  return result;
375 }
376 
// Helper function for unescaping percent-encoded string.
//
// Non-throwing counterpart of the std::string-returning variant: a failure
// reported by impl::do_unescape_percent_encoding is returned to the caller
// via make_unexpected() instead of being thrown as an exception.
389 template< typename Traits = restinio_default_unescape_traits >
393 {
394  std::string result;
     // The decoded value is never longer than the source one.
395  result.reserve( data.size() );
396 
397  auto r = impl::do_unescape_percent_encoding<Traits>(
398  data,
399  [&result]( char ch ) { result += ch; } );
400  if( !r )
401  return make_unexpected( std::move(r.error()) );
402 
     // NOTE(review): presumably the explicit move is here because the value
     // is converted to the expected-like return type -- confirm.
403  return std::move(result);
404 }
405 
// Unescapes a percent-encoded value in place.
//
// The decoded chars are written back into `data` starting from its first
// byte. Returns the count of chars written. If a failure is reported by
// impl::do_unescape_percent_encoding then exception_t with the description
// of that failure is thrown; the beginning of the buffer can already be
// overwritten at that moment.
406 template< typename Traits = restinio_default_unescape_traits >
408 std::size_t
409 inplace_unescape_percent_encoding( char * data, std::size_t size )
410 {
411  std::size_t result_size = 0u;
     // Write position; it starts at the beginning of the source buffer.
412  char * dest = data;
413 
414  auto r = impl::do_unescape_percent_encoding<Traits>(
415  string_view_t{ data, size },
416  [&result_size, &dest]( char ch ) {
417  *dest++ = ch;
418  ++result_size;
419  } );
420  if( !r )
421  throw exception_t{ r.error().giveout_description() };
422 
423  return result_size;
424 }
425 
// Helper function for unescaping percent-encoded string inplace.
//
// The decoded chars are written back into `data` starting from its first
// byte. On success the count of chars written is returned. A failure
// reported by impl::do_unescape_percent_encoding is returned via
// make_unexpected(); the beginning of the buffer can already be overwritten
// at that moment.
438 template< typename Traits = restinio_default_unescape_traits >
441 try_inplace_unescape_percent_encoding( char * data, std::size_t size )
442 {
443  std::size_t result_size = 0u;
     // Write position; it starts at the beginning of the source buffer.
444  char * dest = data;
445 
446  auto r = impl::do_unescape_percent_encoding<Traits>(
447  string_view_t{ data, size },
448  [&result_size, &dest]( char ch ) {
449  *dest++ = ch;
450  ++result_size;
451  } );
452  if( !r )
453  return make_unexpected( std::move(r.error()) );
454 
455  return result_size;
456 }
457 
459 
460 namespace uri_normalization
461 {
462 
463 namespace unreserved_chars
464 {
465 
466 namespace impl
467 {
468 
// Is this symbol a part of unreserved set?
477 constexpr inline bool
478 is_unreserved_char( const char ch ) noexcept
479 {
480  // In this version of RESTinio class restinio_default_unescape_traits
481  // already implements necessary check.
483 }
484 
// Internal helper to perform the main logic of enumeration of symbols in URI.
//
// Goes through `what` and reports the output to one of the handlers:
//  - a char that is not '%' is passed to `one_byte_handler` as is;
//  - "%XX" is decoded and validated as a part of a UTF-8 sequence. If the
//    decoded symbol is an ASCII one accepted by is_unreserved_char() then the
//    decoded char is passed to `one_byte_handler`; otherwise the three source
//    chars are passed to `three_byte_handler` unchanged.
//
// Throws exception_t in the case of a malformed escape sequence, an invalid
// UTF-8 sequence or an unfinished UTF-8 sequence.
499 template<
500  typename One_Byte_Handler,
501  typename Three_Byte_Handler >
502 void
504  string_view_t what,
505  One_Byte_Handler && one_byte_handler,
506  Three_Byte_Handler && three_byte_handler )
507 {
508  using namespace restinio::utils::impl;
509 
510  std::size_t chars_to_handle = what.size();
511  const char * d = what.data();
512 
513  utf8_checker_t utf8_checker;
514  bool expect_next_utf8_byte = false;
515 
     // Offset of the current char from the beginning of `what`;
     // used only for error messages.
516  const auto current_pos = [&d, &what]() noexcept { return d - what.data(); };
517 
518  while( 0 < chars_to_handle )
519  {
     // While a multi-byte UTF-8 sequence is unfinished only another
     // escape sequence may follow.
520  if( expect_next_utf8_byte && '%' != *d )
521  throw exception_t{
522  fmt::format( "next byte from UTF-8 sequence expected at {}",
523  current_pos() )
524  };
525 
526  if( '%' != *d )
527  {
528  // Just one symbol to the output.
529  one_byte_handler( *d );
530  ++d;
531  --chars_to_handle;
532  }
533  else if( chars_to_handle >= 3 &&
534  is_hexdigit( d[ 1 ] ) && is_hexdigit( d[ 2 ] ) )
535  {
536  const char ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
537  if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
538  throw exception_t{
539  fmt::format( "invalid UTF-8 sequence detected at {}",
540  current_pos() )
541  };
542 
     // By default the escape sequence goes to the output unchanged.
543  bool keep_three_bytes = true;
544 
545  if( utf8_checker.finalized() )
546  {
547  expect_next_utf8_byte = false;
548 
549  const auto symbol = utf8_checker.current_symbol();
550  utf8_checker.reset();
551 
     // Only ASCII symbols are candidates for the replacement.
552  if( symbol < 0x80u )
553  {
554  const char ascii_char = static_cast<char>(symbol);
555  if( is_unreserved_char( ascii_char ) )
556  {
557  // percent encoded char will be replaced by one char.
558  one_byte_handler( ascii_char );
559  keep_three_bytes = false;
560  }
561  }
562  }
563  else
564  {
565  expect_next_utf8_byte = true;
566  }
567 
568  if( keep_three_bytes )
569  {
570  // this part of multi-byte char will go to the output as is.
571  three_byte_handler( d[ 0 ], d[ 1 ], d[ 2 ] );
572  }
573 
574  chars_to_handle -= 3;
575  d += 3u;
576  }
577  else
578  {
579  throw exception_t{
580  fmt::format( "invalid escape sequence at pos {}", current_pos() )
581  };
582  }
583  }
584 
     // The input ended in the middle of a multi-byte UTF-8 sequence.
585  if( expect_next_utf8_byte )
586  throw exception_t{ fmt::format( "unfinished UTF-8 sequence" ) };
587 }
588 
589 } /* namespace impl */
590 
// Calculate the size of a buffer to hold normalized value of a URI.
//
// Nothing is written anywhere: the handlers below only count the output,
// one char per one-byte result and three chars per kept escape sequence.
604 inline std::size_t
606  string_view_t what )
607 {
608  std::size_t calculated_capacity = 0u;
609 
611  [&calculated_capacity]( char ) noexcept {
612  ++calculated_capacity;
613  },
614  [&calculated_capacity]( char, char, char ) noexcept {
615  calculated_capacity += 3u;
616  } );
617 
618  return calculated_capacity;
619 }
620 
// Perform normalization of URI value.
//
// The normalized value is written to `dest` sequentially. No bounds checks
// are visible here.
// NOTE(review): presumably `dest` must point to a buffer of at least
// estimate_required_capacity(what) chars -- confirm against the callers.
639 inline void
641  string_view_t what,
642  char * dest )
643 {
645  [&dest]( char ch ) noexcept {
646  *dest++ = ch;
647  },
648  [&dest]( char ch1, char ch2, char ch3 ) noexcept {
649  dest[ 0 ] = ch1;
650  dest[ 1 ] = ch2;
651  dest[ 2 ] = ch3;
652  dest += 3;
653  } );
654 }
655 
656 } /* namespace unreserved_chars */
657 
658 } /* namespace uri_normalization */
659 
660 } /* namespace utils */
661 
662 } /* namespace restinio */
663 
restinio::exception_t
Exception class for all exceptions thrown by RESTinio.
Definition: exception.hpp:26
restinio::utils::utf8_checker_t::current_symbol
RESTINIO_NODISCARD std::uint32_t current_symbol() const noexcept
Definition: utf8_checker.hpp:134
RESTINIO_NODISCARD
#define RESTINIO_NODISCARD
Definition: compiler_features.hpp:33
restinio::easy_parser::symbol
RESTINIO_NODISCARD auto symbol(char expected) noexcept
A factory function to create a clause that expects the specified symbol, extracts it and then skips i...
Definition: easy_parser.hpp:4032
restinio::utils::javascript_compatible_unescape_traits::ordinary_char
static constexpr bool ordinary_char(char c) noexcept
Definition: percent_encoding.hpp:127
nonstd::optional_lite::std11::move
T & move(T &t)
Definition: optional.hpp:421
restinio::utils::restinio_default_unescape_traits::ordinary_char
static constexpr bool ordinary_char(char c) noexcept
Definition: percent_encoding.hpp:38
restinio::utils::unescape_percent_encoding
RESTINIO_NODISCARD std::string unescape_percent_encoding(const string_view_t data)
Definition: percent_encoding.hpp:363
string_view.hpp
restinio::utils::x_www_form_urlencoded_unescape_traits
Traits for escaping and unescaping symbols in a query string in correspondence with application/x-www...
Definition: percent_encoding.hpp:61
restinio::utils::x_www_form_urlencoded_unescape_traits::ordinary_char
static constexpr bool ordinary_char(char c) noexcept
Definition: percent_encoding.hpp:63
restinio::utils::uri_normalization::unreserved_chars::impl::is_unreserved_char
constexpr RESTINIO_NODISCARD bool is_unreserved_char(const char ch) noexcept
Is this symbol a part of unreserved set?
Definition: percent_encoding.hpp:478
restinio::utils::escape_percent_encoding
RESTINIO_NODISCARD std::string escape_percent_encoding(const string_view_t data)
Percent encoding.
Definition: percent_encoding.hpp:328
restinio::string_view_t
nonstd::string_view string_view_t
Definition: string_view.hpp:19
restinio::utils::unescape_percent_encoding_failure_t::description
RESTINIO_NODISCARD const std::string & description() const noexcept
Get a reference to the description of the failure.
Definition: percent_encoding.hpp:173
restinio::utils::unescape_percent_encoding_failure_t
Type that indicates a failure of unescaping of percent-encoded symbols.
Definition: percent_encoding.hpp:160
restinio::utils::restinio_default_unescape_traits
The default traits for escaping and unexcaping symbols in a query string.
Definition: percent_encoding.hpp:36
restinio::utils::uri_normalization::unreserved_chars::normalize_to
void normalize_to(string_view_t what, char *dest)
Perform normalization of URI value.
Definition: percent_encoding.hpp:640
restinio::utils::try_inplace_unescape_percent_encoding
RESTINIO_NODISCARD expected_t< std::size_t, unescape_percent_encoding_failure_t > try_inplace_unescape_percent_encoding(char *data, std::size_t size)
Helper function for unescaping percent-encoded string inplace.
Definition: percent_encoding.hpp:441
restinio::utils::utf8_checker_t::process_byte
RESTINIO_NODISCARD bool process_byte(std::uint8_t byte) noexcept
Definition: utf8_checker.hpp:41
restinio::utils::relaxed_unescape_traits
Traits for escaping and unescaping symbols in a query string in very relaxed mode.
Definition: percent_encoding.hpp:97
restinio::utils::uri_normalization::unreserved_chars::estimate_required_capacity
RESTINIO_NODISCARD std::size_t estimate_required_capacity(string_view_t what)
Calculate the size of a buffer to hold normalized value of a URI.
Definition: percent_encoding.hpp:605
restinio::utils::utf8_checker_t::finalized
RESTINIO_NODISCARD bool finalized() const noexcept
Definition: utf8_checker.hpp:120
restinio::utils::unescape_percent_encoding_failure_t::unescape_percent_encoding_failure_t
unescape_percent_encoding_failure_t(std::string description)
Definition: percent_encoding.hpp:165
restinio::utils::unescape_percent_encoding_success_t
Type that indicates that unescaping of percent-encoded symbols completed successfully.
Definition: percent_encoding.hpp:151
restinio::utils::utf8_checker_t
Helper class for checking UTF-8 byte sequence during parsing URI or incoming byte stream.
Definition: utf8_checker.hpp:35
include_fmtlib.hpp
A special wrapper around fmtlib include files.
restinio::utils::javascript_compatible_unescape_traits
The traits for escaping and unescaping symbols in JavaScript-compatible mode.
Definition: percent_encoding.hpp:125
restinio::expected_t
nonstd::expected< T, E > expected_t
Definition: expected.hpp:22
restinio::utils::utf8_checker_t::reset
void reset() noexcept
Definition: utf8_checker.hpp:126
restinio::utils::unescape_percent_encoding_failure_t::m_description
std::string m_description
Description of a failure.
Definition: percent_encoding.hpp:162
restinio::utils::inplace_unescape_percent_encoding
RESTINIO_NODISCARD std::size_t inplace_unescape_percent_encoding(char *data, std::size_t size)
Definition: percent_encoding.hpp:409
restinio
Definition: asio_include.hpp:21
restinio::utils::try_unescape_percent_encoding
RESTINIO_NODISCARD expected_t< std::string, unescape_percent_encoding_failure_t > try_unescape_percent_encoding(const string_view_t data)
Helper function for unescaping percent-encoded string.
Definition: percent_encoding.hpp:392
exception.hpp
restinio::utils::impl::extract_escaped_char
char extract_escaped_char(char c1, char c2)
Definition: percent_encoding.hpp:199
utf8_checker.hpp
An implementation of checker for UTF-8 sequences.
restinio::utils::unescape_percent_encoding_failure_t::giveout_description
RESTINIO_NODISCARD std::string giveout_description() noexcept
Get out the value of the description of the failure.
Definition: percent_encoding.hpp:183
expected.hpp
restinio::utils::impl::do_unescape_percent_encoding
RESTINIO_NODISCARD expected_t< unescape_percent_encoding_success_t, unescape_percent_encoding_failure_t > do_unescape_percent_encoding(const string_view_t data, Chars_Collector &&collector)
The actual implementation of unescape-percent-encoding procedure.
Definition: percent_encoding.hpp:239
restinio::utils::impl::is_hexdigit
bool is_hexdigit(char c)
Definition: percent_encoding.hpp:190
restinio::utils::impl
Definition: bitops.hpp:17
restinio::utils::relaxed_unescape_traits::ordinary_char
static bool ordinary_char(char c) noexcept
Definition: percent_encoding.hpp:99
restinio::utils::uri_normalization::unreserved_chars::impl::run_normalization_algo
void run_normalization_algo(string_view_t what, One_Byte_Handler &&one_byte_handler, Three_Byte_Handler &&three_byte_handler)
Internal helper to perform the main logic of enumeration of symbols in URI.
Definition: percent_encoding.hpp:503
const
#define const
Definition: zconf.h:230