json_parser.hpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614
  1. // Copyright (c) 2018-2020 Jsonxx - Nomango
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy
  4. // of this software and associated documentation files (the "Software"), to deal
  5. // in the Software without restriction, including without limitation the rights
  6. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. // copies of the Software, and to permit persons to whom the Software is
  8. // furnished to do so, subject to the following conditions:
  9. //
  10. // The above copyright notice and this permission notice shall be included in
  11. // all copies or substantial portions of the Software.
  12. //
  13. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. // THE SOFTWARE.
  20. #pragma once
#include <cctype>   // std::isdigit
#include <cstddef>  // std::size_t
#include <cstdint>  // uint16_t
#include <cstdio> // std::FILE
#include <ios>
#include <istream>    // std::basic_istream
#include <streambuf>  // std::basic_streambuf
#include <string>     // std::char_traits
#include <type_traits>
#include <utility>    // std::move, std::make_pair
#include "json_value.hpp"
  25. namespace jsonxx {
  26. //
  27. // input_adapter
  28. //
  29. template<typename _CharTy>
  30. struct input_adapter {
  31. using char_type = _CharTy;
  32. using char_traits = std::char_traits<char_type>;
  33. virtual typename char_traits::int_type get_char() = 0;
  34. virtual ~input_adapter() = default;
  35. };
  36. template<typename _CharTy>
  37. struct file_input_adapter
  38. : public input_adapter<_CharTy> {
  39. using char_type = typename input_adapter<_CharTy>::char_type;
  40. using char_traits = typename input_adapter<_CharTy>::char_traits;
  41. file_input_adapter(std::FILE *file) : file(file) {}
  42. virtual typename char_traits::int_type get_char() override {
  43. return std::fgetc(file);
  44. }
  45. private:
  46. std::FILE *file;
  47. };
  48. template<typename _CharTy>
  49. struct stream_input_adapter
  50. : public input_adapter<_CharTy> {
  51. using char_type = typename input_adapter<_CharTy>::char_type;
  52. using char_traits = typename input_adapter<_CharTy>::char_traits;
  53. stream_input_adapter(std::basic_istream<char_type> &stream) : stream(stream), streambuf(*stream.rdbuf()) {}
  54. virtual typename char_traits::int_type get_char() override {
  55. auto ch = streambuf.sbumpc();
  56. if (ch == EOF) {
  57. stream.clear(stream.rdstate() | std::ios::eofbit);
  58. }
  59. return ch;
  60. }
  61. virtual ~stream_input_adapter() {
  62. stream.clear(stream.rdstate() & std::ios::eofbit);
  63. }
  64. private:
  65. std::basic_istream<char_type> &stream;
  66. std::basic_streambuf<char_type> &streambuf;
  67. };
  68. template<typename _StringTy>
  69. struct string_input_adapter
  70. : public input_adapter<typename _StringTy::value_type> {
  71. using char_type = typename input_adapter<typename _StringTy::value_type>::char_type;
  72. using char_traits = typename input_adapter<typename _StringTy::value_type>::char_traits;
  73. string_input_adapter(const _StringTy &str) : str(str), index(0) {}
  74. virtual typename char_traits::int_type get_char() override {
  75. if (index == str.size())
  76. return char_traits::eof();
  77. return str[index++];
  78. }
  79. private:
  80. const _StringTy &str;
  81. typename _StringTy::size_type index;
  82. };
  83. template<typename _CharTy>
  84. struct buffer_input_adapter
  85. : public input_adapter<_CharTy> {
  86. using char_type = typename input_adapter<_CharTy>::char_type;
  87. using char_traits = typename input_adapter<_CharTy>::char_traits;
  88. buffer_input_adapter(const _CharTy *str) : str(str), index(0) {}
  89. virtual typename char_traits::int_type get_char() override {
  90. if (str[index] == '\0')
  91. return char_traits::eof();
  92. return str[index++];
  93. }
  94. private:
  95. const char_type *str;
  96. std::size_t index;
  97. };
  98. //
  99. // json_lexer & json_parser
  100. //
// Kinds of token that json_lexer::scan() reports to json_parser.
enum class token_type {
    uninitialized,    // placeholder used before any token has been produced
    literal_true,     // the literal `true`
    literal_false,    // the literal `false`
    literal_null,     // the literal `null`
    value_string,     // a quoted string; text available via token_to_string()
    value_integer,    // a number without fraction or exponent; see token_to_integer()
    value_float,      // a number with fraction and/or exponent; see token_to_float()
    begin_array,      // '['
    end_array,        // ']'
    begin_object,     // '{'
    end_object,       // '}'
    name_separator,   // ':'
    value_separator,  // ','
    parse_error,      // unexpected character or malformed token
    end_of_input      // '\0' or the end-of-file value was read
};
  118. template<typename _BasicJsonTy>
  119. struct json_lexer {
  120. using string_type = typename _BasicJsonTy::string_type;
  121. using char_type = typename _BasicJsonTy::char_type;
  122. using integer_type = typename _BasicJsonTy::integer_type;
  123. using float_type = typename _BasicJsonTy::float_type;
  124. using boolean_type = typename _BasicJsonTy::boolean_type;
  125. using array_type = typename _BasicJsonTy::array_type;
  126. using object_type = typename _BasicJsonTy::object_type;
  127. using char_traits = std::char_traits<char_type>;
  128. json_lexer(input_adapter<char_type> *adapter) : adapter(adapter) {
  129. // read first char
  130. read_next();
  131. }
  132. typename char_traits::int_type read_next() {
  133. current = adapter->get_char();
  134. return current;
  135. }
  136. void skip_spaces() {
  137. while (current == ' ' || current == '\t' || current == '\n' || current == '\r') {
  138. read_next();
  139. }
  140. }
  141. token_type scan() {
  142. skip_spaces();
  143. token_type result = token_type::uninitialized;
  144. switch (current) {
  145. case '[':
  146. result = token_type::begin_array;
  147. break;
  148. case ']':
  149. result = token_type::end_array;
  150. break;
  151. case '{':
  152. result = token_type::begin_object;
  153. break;
  154. case '}':
  155. result = token_type::end_object;
  156. break;
  157. case ':':
  158. result = token_type::name_separator;
  159. break;
  160. case ',':
  161. result = token_type::value_separator;
  162. break;
  163. case 't':
  164. return scan_literal("true", token_type::literal_true);
  165. case 'f':
  166. return scan_literal("false", token_type::literal_false);
  167. case 'n':
  168. return scan_literal("null", token_type::literal_null);
  169. case '\"':
  170. return scan_string();
  171. case '-':
  172. case '0':
  173. case '1':
  174. case '2':
  175. case '3':
  176. case '4':
  177. case '5':
  178. case '6':
  179. case '7':
  180. case '8':
  181. case '9':
  182. return scan_number();
  183. case '\0':
  184. case char_traits::eof():
  185. return token_type::end_of_input;
  186. // unexpected char
  187. default:
  188. return token_type::parse_error;
  189. }
  190. // skip current char
  191. read_next();
  192. return result;
  193. }
  194. token_type scan_literal(const char_type *text, token_type result) {
  195. for (std::size_t i = 0; text[i] != '\0'; ++i) {
  196. if (text[i] != char_traits::to_char_type(current)) {
  197. return token_type::parse_error;
  198. }
  199. read_next();
  200. }
  201. return result;
  202. }
  203. token_type scan_string() {
  204. if (current != '\"')
  205. return token_type::parse_error;
  206. string_buffer.clear();
  207. while (true) {
  208. const auto ch = read_next();
  209. switch (ch) {
  210. case char_traits::eof(): {
  211. // unexpected end
  212. return token_type::parse_error;
  213. }
  214. case '\"': {
  215. // skip last `\"`
  216. read_next();
  217. return token_type::value_string;
  218. }
  219. case 0x00:
  220. case 0x01:
  221. case 0x02:
  222. case 0x03:
  223. case 0x04:
  224. case 0x05:
  225. case 0x06:
  226. case 0x07:
  227. case 0x08:
  228. case 0x09:
  229. case 0x0A:
  230. case 0x0B:
  231. case 0x0C:
  232. case 0x0D:
  233. case 0x0E:
  234. case 0x0F:
  235. case 0x10:
  236. case 0x11:
  237. case 0x12:
  238. case 0x13:
  239. case 0x14:
  240. case 0x15:
  241. case 0x16:
  242. case 0x17:
  243. case 0x18:
  244. case 0x19:
  245. case 0x1A:
  246. case 0x1B:
  247. case 0x1C:
  248. case 0x1D:
  249. case 0x1E:
  250. case 0x1F: {
  251. // invalid control character
  252. return token_type::parse_error;
  253. }
  254. case '\\': {
  255. switch (read_next()) {
  256. case '\"':
  257. string_buffer.push_back('\"');
  258. break;
  259. case '\\':
  260. string_buffer.push_back('\\');
  261. break;
  262. case '/':
  263. string_buffer.push_back('/');
  264. break;
  265. case 'b':
  266. string_buffer.push_back('\b');
  267. break;
  268. case 'f':
  269. string_buffer.push_back('\f');
  270. break;
  271. case 'n':
  272. string_buffer.push_back('\n');
  273. break;
  274. case 'r':
  275. string_buffer.push_back('\r');
  276. break;
  277. case 't':
  278. string_buffer.push_back('\t');
  279. break;
  280. case 'u': {
  281. // unicode escapes
  282. uint16_t byte = 0;
  283. for (const auto factor : {12, 8, 4, 0}) {
  284. const auto n = read_next();
  285. if (n >= L'0' && n <= L'9') {
  286. byte += ((n - L'0') << factor);
  287. } else if (n >= L'A' && n <= L'F') {
  288. byte += ((n - L'A' + 10) << factor);
  289. } else if (n >= L'a' && n <= L'f') {
  290. byte += ((n - L'a' + 10) << factor);
  291. } else {
  292. // '\u' must be followed by 4 hex digits
  293. return token_type::parse_error;
  294. }
  295. }
  296. string_buffer.push_back(char_traits::to_char_type(byte));
  297. break;
  298. }
  299. default: {
  300. return token_type::parse_error;
  301. }
  302. }
  303. break;
  304. }
  305. default: {
  306. if (ch > 0x1F && ch < 0x7F) {
  307. string_buffer.push_back(char_traits::to_char_type(ch));
  308. break;
  309. } else {
  310. return token_type::parse_error;
  311. }
  312. }
  313. }
  314. }
  315. }
  316. token_type scan_number() {
  317. is_negative = false;
  318. number_value = static_cast<float_type>(0.0);
  319. if (current == '-') {
  320. return scan_negative();
  321. }
  322. if (current == '0') {
  323. return scan_zero();
  324. }
  325. return scan_integer();
  326. }
  327. token_type scan_negative() {
  328. if (current == '-') {
  329. is_negative = true;
  330. read_next();
  331. return scan_integer();
  332. }
  333. return token_type::parse_error;
  334. }
  335. token_type scan_zero() {
  336. if (current == '0') {
  337. if (read_next() == '.')
  338. return scan_float();
  339. else
  340. return token_type::value_integer;
  341. }
  342. return token_type::parse_error;
  343. }
  344. token_type scan_integer() {
  345. if (std::isdigit(current)) {
  346. number_value = static_cast<float_type>(current - '0');
  347. while (true) {
  348. const auto ch = read_next();
  349. if (ch == '.')
  350. return scan_float();
  351. if (ch == 'e' || ch == 'E')
  352. return scan_exponent();
  353. if (std::isdigit(ch))
  354. number_value = number_value * 10 + (ch - '0');
  355. else
  356. break;
  357. }
  358. return token_type::value_integer;
  359. }
  360. return token_type::parse_error;
  361. }
  362. token_type scan_float() {
  363. if (current != '.')
  364. return token_type::parse_error;
  365. if (std::isdigit(read_next())) {
  366. float_type fraction = static_cast<float_type>(0.1);
  367. number_value += static_cast<float_type>(current - '0') * fraction;
  368. while (true) {
  369. const auto ch = read_next();
  370. if (ch == 'e' || ch == 'E')
  371. return scan_exponent();
  372. if (std::isdigit(ch)) {
  373. fraction *= static_cast<float_type>(0.1);
  374. number_value += static_cast<float_type>(ch - '0') * fraction;
  375. } else
  376. break;
  377. }
  378. return token_type::value_float;
  379. }
  380. return token_type::parse_error;
  381. }
  382. token_type scan_exponent() {
  383. if (current != 'e' && current != 'E')
  384. return token_type::parse_error;
  385. // skip current char
  386. read_next();
  387. if ((std::isdigit(current) && current != '0') || (current == '-') || (current == '+')) {
  388. float_type base = 10;
  389. if (current == '+') {
  390. read_next();
  391. } else if (current == '-') {
  392. base = static_cast<float_type>(0.1);
  393. read_next();
  394. }
  395. unsigned int exponent = static_cast<unsigned int>(current - '0');
  396. while (std::isdigit(read_next())) {
  397. exponent = (exponent * 10) + static_cast<unsigned int>(current - '0');
  398. }
  399. float_type power = 1;
  400. for (; exponent; exponent >>= 1, base *= base)
  401. if (exponent & 1)
  402. power *= base;
  403. number_value *= power;
  404. return token_type::value_float;
  405. }
  406. return token_type::parse_error;
  407. }
  408. integer_type token_to_integer() const {
  409. integer_type integer = static_cast<integer_type>(number_value);
  410. return is_negative ? -integer : integer;
  411. }
  412. float_type token_to_float() const {
  413. return is_negative ? -number_value : number_value;
  414. }
  415. string_type token_to_string() const {
  416. return string_buffer;
  417. }
  418. private:
  419. input_adapter<char_type> *adapter;
  420. typename char_traits::int_type current;
  421. bool is_negative;
  422. float_type number_value;
  423. string_type string_buffer;
  424. };
  425. template<typename _BasicJsonTy>
  426. struct json_parser {
  427. using string_type = typename _BasicJsonTy::string_type;
  428. using char_type = typename _BasicJsonTy::char_type;
  429. using integer_type = typename _BasicJsonTy::integer_type;
  430. using float_type = typename _BasicJsonTy::float_type;
  431. using boolean_type = typename _BasicJsonTy::boolean_type;
  432. using array_type = typename _BasicJsonTy::array_type;
  433. using object_type = typename _BasicJsonTy::object_type;
  434. using char_traits = std::char_traits<char_type>;
  435. json_parser(input_adapter<char_type> *adapter)
  436. : lexer(adapter), last_token(token_type::uninitialized) {
  437. }
  438. void parse(_BasicJsonTy &json) {
  439. parse_value(json);
  440. if (get_token() != token_type::end_of_input)
  441. throw json_parse_error("unexpected token, expect end");
  442. }
  443. private:
  444. token_type get_token() {
  445. last_token = lexer.scan();
  446. return last_token;
  447. }
  448. void parse_value(_BasicJsonTy &json) {
  449. switch (get_token()) {
  450. case token_type::literal_true:
  451. json = json_type::boolean;
  452. json.value_.data.boolean = true;
  453. break;
  454. case token_type::literal_false:
  455. json = json_type::boolean;
  456. json.value_.data.boolean = false;
  457. break;
  458. case token_type::literal_null:
  459. json = json_type::null;
  460. break;
  461. case token_type::value_string:
  462. json = lexer.token_to_string();
  463. break;
  464. case token_type::value_integer:
  465. json = lexer.token_to_integer();
  466. break;
  467. case token_type::value_float:
  468. json = lexer.token_to_float();
  469. break;
  470. case token_type::begin_array:
  471. json = json_type::array;
  472. while (true) {
  473. json.value_.data.vector->push_back(_BasicJsonTy());
  474. parse_value(json.value_.data.vector->back());
  475. // read ','
  476. if (get_token() != token_type::value_separator)
  477. break;
  478. }
  479. if (last_token != token_type::end_array)
  480. throw json_parse_error("unexpected token in array");
  481. break;
  482. case token_type::begin_object:
  483. json = json_type::object;
  484. while (true) {
  485. if (get_token() != token_type::value_string)
  486. break;
  487. string_type key = lexer.token_to_string();
  488. if (get_token() != token_type::name_separator)
  489. break;
  490. _BasicJsonTy object;
  491. parse_value(object);
  492. json.value_.data.object->insert(std::make_pair(key, object));
  493. // read ','
  494. if (get_token() != token_type::value_separator)
  495. break;
  496. }
  497. if (last_token != token_type::end_object)
  498. throw json_parse_error("unexpected token in object");
  499. break;
  500. default:
  501. // unexpected token
  502. throw json_parse_error("unexpected token");
  503. break;
  504. }
  505. }
  506. private:
  507. json_lexer<_BasicJsonTy> lexer;
  508. token_type last_token;
  509. };
} // namespace jsonxx