Regex.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. #include "Regex.h"
  2. using namespace Framework;
  3. using namespace Regex;
  4. int findEndPos(Text* regex, int startPos, char endChar, char startChar)
  5. {
  6. int length = regex->getLength();
  7. int i = startPos;
  8. int opened = 0;
  9. while (i < length)
  10. {
  11. if (regex->getText()[i] == '\\')
  12. {
  13. i++;
  14. }
  15. else if (regex->getText()[i] == startChar)
  16. {
  17. opened++;
  18. }
  19. else if (regex->getText()[i] == endChar)
  20. {
  21. if (opened == 0)
  22. {
  23. return i;
  24. }
  25. opened--;
  26. }
  27. i++;
  28. }
  29. return -1;
  30. }
  31. Automata<char>* parseCharacterList(Text* regex, RegexConfig& config)
  32. {
  33. bool negated = regex->hatAt(0, "^");
  34. Automata<char>* result = new Automata<char>();
  35. State<char>* start = result->addState();
  36. State<char>* end = result->addState();
  37. end->setFinal(true);
  38. int length = regex->getLength();
  39. bool escaped = false;
  40. bool minusValid = false;
  41. Text characterList;
  42. std::function<bool(char, int)> negatedFunction
  43. = [](char c, int flags) { return true; };
  44. for (int i = 0; i < length; i++)
  45. {
  46. switch (regex->getText()[i])
  47. {
  48. case '\\':
  49. {
  50. escaped = true;
  51. minusValid = false;
  52. break;
  53. }
  54. case '-':
  55. if (escaped || !minusValid)
  56. {
  57. characterList += "-";
  58. }
  59. else if (minusValid)
  60. {
  61. if (i == length - 1)
  62. {
  63. characterList += "-";
  64. }
  65. else
  66. {
  67. unsigned char before = (unsigned char)
  68. characterList[characterList.getLength() - 1];
  69. characterList.remove(characterList.getLength() - 1, 1);
  70. unsigned char after
  71. = (unsigned char)regex->getText()[i + 1];
  72. if (before > after)
  73. {
  74. unsigned char temp = before;
  75. before = after;
  76. after = temp;
  77. }
  78. for (unsigned char c = before; c <= after; c++)
  79. {
  80. characterList.append((char)c);
  81. }
  82. i++;
  83. }
  84. }
  85. break;
  86. default:
  87. {
  88. if (escaped)
  89. {
  90. if (regex->getText()[i] == 'w')
  91. {
  92. if (negated)
  93. {
  94. negatedFunction
  95. = [config, negatedFunction](char c, int flags) {
  96. return negatedFunction(c, flags)
  97. && !config.getWordChars().hat(c);
  98. };
  99. }
  100. else
  101. {
  102. characterList += config.getWordChars();
  103. }
  104. minusValid = false;
  105. }
  106. else if (regex->getText()[i] == 'W')
  107. {
  108. if (negated)
  109. {
  110. characterList += config.getWordChars();
  111. }
  112. else
  113. {
  114. start->addTransition(Transition<char>(
  115. [config](char c, int flags) {
  116. return !config.getWordChars().hat(c);
  117. },
  118. end,
  119. 0));
  120. }
  121. minusValid = false;
  122. }
  123. else if (regex->getText()[i] == 'd')
  124. {
  125. if (negated)
  126. {
  127. negatedFunction
  128. = [config, negatedFunction](char c, int flags) {
  129. return negatedFunction(c, flags)
  130. && (c < '0' || c > '9');
  131. };
  132. }
  133. else
  134. {
  135. characterList.append("0123456789");
  136. }
  137. minusValid = false;
  138. }
  139. else if (regex->getText()[i] == 's')
  140. {
  141. if (negated)
  142. {
  143. negatedFunction = [config, negatedFunction](
  144. char c, int flags) {
  145. return negatedFunction(c, flags)
  146. && !config.getWhitespaceChars().hat(c);
  147. };
  148. }
  149. else
  150. {
  151. characterList.append(config.getWhitespaceChars());
  152. }
  153. minusValid = false;
  154. }
  155. else if (regex->getText()[i] == 'D')
  156. {
  157. if (negated)
  158. {
  159. negatedFunction
  160. = [negatedFunction](char c, int flags) {
  161. return negatedFunction(c, flags)
  162. && (c >= '0' && c <= '9');
  163. };
  164. }
  165. else
  166. {
  167. start->addTransition(Transition<char>(
  168. [](char c, int flags) {
  169. return c < '0' || c > '9';
  170. },
  171. end,
  172. 0));
  173. }
  174. minusValid = false;
  175. }
  176. else if (regex->getText()[i] == 'S')
  177. {
  178. if (negated)
  179. {
  180. negatedFunction
  181. = [config, negatedFunction](char c, int flags) {
  182. return negatedFunction(c, flags)
  183. && config.getWhitespaceChars().hat(c);
  184. };
  185. }
  186. else
  187. {
  188. start->addTransition(Transition<char>(
  189. [config](char c, int flags) {
  190. return !config.getWhitespaceChars().hat(c);
  191. },
  192. end,
  193. 0));
  194. }
  195. minusValid = false;
  196. }
  197. else if (regex->getText()[i] == 'n')
  198. {
  199. characterList.append('\n');
  200. }
  201. else if (regex->getText()[i] == 'r')
  202. {
  203. characterList.append('\r');
  204. }
  205. else if (regex->getText()[i] == 't')
  206. {
  207. characterList.append('\t');
  208. }
  209. else
  210. {
  211. characterList.append(regex->getText()[i]);
  212. minusValid = true;
  213. }
  214. }
  215. else
  216. {
  217. characterList.append(regex->getText()[i]);
  218. minusValid = true;
  219. }
  220. break;
  221. }
  222. }
  223. }
  224. if (negated)
  225. {
  226. negatedFunction = [characterList, negatedFunction](char c, int flags) {
  227. return negatedFunction(c, flags) && !characterList.hat(c);
  228. };
  229. start->addTransition(Transition<char>(negatedFunction, end, 0));
  230. }
  231. else
  232. {
  233. start->addTransition(Transition<char>(
  234. [characterList](char c, int flags) { return characterList.hat(c); },
  235. end,
  236. 0));
  237. }
  238. regex->release();
  239. return result;
  240. }
  241. Automata<char>* concat(Array<Automata<char>*> parsers)
  242. {
  243. if (parsers.getEintragAnzahl() == 0)
  244. {
  245. Automata<char>* result = new Automata<char>();
  246. State<char>* start = result->addState();
  247. State<char>* end = result->addState();
  248. end->setFinal(true);
  249. start->addTransition(Transition<char>(0, end, 0));
  250. return result;
  251. }
  252. Automata<char>* result = 0;
  253. for (Automata<char>* parser : parsers)
  254. {
  255. if (result == 0)
  256. {
  257. result = parser;
  258. }
  259. else
  260. {
  261. result = concat(result, parser);
  262. }
  263. }
  264. return result;
  265. }
  266. Automata<char>* parse(Text* regex, int& nextGroupId, RegexConfig& config)
  267. {
  268. Array<Automata<char>*> results;
  269. int length = regex->getLength();
  270. for (int i = 0; i < length; i++)
  271. {
  272. bool escaped = false;
  273. switch (regex->getText()[i])
  274. {
  275. case '\\':
  276. {
  277. escaped = true;
  278. i++;
  279. break;
  280. }
  281. case '(':
  282. {
  283. int end = findEndPos(regex, i + 1, ')', '(');
  284. Text* subRegex = regex->getTeilText(i + 1, end);
  285. Automata<char>* subAutomata = 0;
  286. if (subRegex->hatAt(0, "?:"))
  287. {
  288. subRegex->remove(0, 2);
  289. subAutomata = parse(subRegex, nextGroupId, config);
  290. }
  291. else
  292. {
  293. int groupId = nextGroupId++;
  294. subAutomata
  295. = group(parse(subRegex, nextGroupId, config), groupId);
  296. }
  297. results.add(subAutomata);
  298. i = end;
  299. break;
  300. }
  301. case '[':
  302. {
  303. int end = findEndPos(regex, i + 1, ']', 0);
  304. Text* subRegex = regex->getTeilText(i + 1, end);
  305. Automata<char>* subAutomata
  306. = parseCharacterList(subRegex, config);
  307. results.add(subAutomata);
  308. i = end;
  309. break;
  310. }
  311. case '?':
  312. {
  313. Automata<char>* last
  314. = results.get(results.getEintragAnzahl() - 1);
  315. results.remove(results.getEintragAnzahl() - 1);
  316. results.add(maybe(last));
  317. break;
  318. }
  319. case '*':
  320. {
  321. bool lazy = false;
  322. if (i < length - 1 && regex->getText()[i + 1] == '?')
  323. {
  324. lazy = true;
  325. i++;
  326. }
  327. Automata<char>* last
  328. = results.get(results.getEintragAnzahl() - 1);
  329. results.remove(results.getEintragAnzahl() - 1);
  330. results.add(many(last, lazy));
  331. break;
  332. }
  333. case '+':
  334. {
  335. bool lazy = false;
  336. if (i < length - 1 && regex->getText()[i + 1] == '?')
  337. {
  338. lazy = true;
  339. i++;
  340. }
  341. Automata<char>* last
  342. = results.get(results.getEintragAnzahl() - 1);
  343. results.remove(results.getEintragAnzahl() - 1);
  344. results.add(atLeastOnce(last, lazy));
  345. break;
  346. }
  347. case '.':
  348. {
  349. Automata<char>* any = new Automata<char>();
  350. State<char>* start = any->addState();
  351. State<char>* end = any->addState();
  352. end->setFinal(true);
  353. start->addTransition(Transition<char>(
  354. [](char c, int flags) { return true; }, end, 0));
  355. results.add(any);
  356. break;
  357. }
  358. case '|':
  359. {
  360. Automata<char>* result = oneOf(concat(results),
  361. parse(regex->getTeilText(i + 1), nextGroupId, config));
  362. regex->release();
  363. return result;
  364. }
  365. case '{':
  366. {
  367. int end = findEndPos(regex, i + 1, '}', 0);
  368. Text* subRegex = regex->getTeilText(i + 1, end);
  369. int min = 0;
  370. int max = 0;
  371. if (subRegex->hat(","))
  372. {
  373. Text* minText
  374. = subRegex->getTeilText(0, subRegex->positionVon(","));
  375. Text* maxText
  376. = subRegex->getTeilText(subRegex->positionVon(",") + 1);
  377. min = (int)*minText;
  378. max = (int)*maxText;
  379. minText->release();
  380. maxText->release();
  381. }
  382. else
  383. {
  384. min = (int)*subRegex;
  385. max = min;
  386. }
  387. Automata<char>* last
  388. = results.get(results.getEintragAnzahl() - 1);
  389. results.remove(results.getEintragAnzahl() - 1);
  390. results.add(repeat(last, min, max));
  391. i = end;
  392. subRegex->release();
  393. break;
  394. }
  395. case '^':
  396. {
  397. Automata<char>* qutomata = new Automata<char>();
  398. State<char>* start = qutomata->addState();
  399. State<char>* end = qutomata->addState();
  400. end->setFinal(true);
  401. start->addTransition(Transition<char>(0, end, Flags::START));
  402. results.add(qutomata);
  403. break;
  404. }
  405. case '$':
  406. {
  407. Automata<char>* qutomata = new Automata<char>();
  408. State<char>* start = qutomata->addState();
  409. State<char>* end = qutomata->addState();
  410. end->setFinal(true);
  411. start->addTransition(Transition<char>(0, end, Flags::END));
  412. results.add(qutomata);
  413. break;
  414. }
  415. default:
  416. {
  417. Automata<char>* automata = new Automata<char>();
  418. State<char>* start = automata->addState();
  419. State<char>* end = automata->addState();
  420. end->setFinal(true);
  421. char current = regex->getText()[i];
  422. start->addTransition(Transition<char>(
  423. [current](char c, int flags) { return c == current; },
  424. end,
  425. 0));
  426. results.add(automata);
  427. break;
  428. }
  429. }
  430. if (escaped)
  431. {
  432. std::function<bool(char, int)> function;
  433. switch (regex->getText()[i])
  434. {
  435. case 'n':
  436. {
  437. function = [](char c, int flags) { return c == '\n'; };
  438. break;
  439. }
  440. case 'r':
  441. {
  442. function = [](char c, int flags) { return c == '\r'; };
  443. break;
  444. }
  445. case 't':
  446. {
  447. function = [](char c, int flags) { return c == '\t'; };
  448. break;
  449. }
  450. case 'w':
  451. {
  452. function = [config](char c, int flags) {
  453. return config.getWordChars().hat(c);
  454. };
  455. break;
  456. }
  457. case 'W':
  458. {
  459. function = [config](char c, int flags) {
  460. return !config.getWordChars().hat(c);
  461. };
  462. break;
  463. }
  464. case 'd':
  465. {
  466. function = [](char c, int flags) {
  467. return c >= '0' && c <= '9';
  468. };
  469. break;
  470. }
  471. case 'D':
  472. {
  473. function
  474. = [](char c, int flags) { return c < '0' || c > '9'; };
  475. break;
  476. }
  477. case 's':
  478. {
  479. function = [config](char c, int flags) {
  480. return config.getWhitespaceChars().hat(c);
  481. };
  482. break;
  483. }
  484. case 'S':
  485. {
  486. function = [config](char c, int flags) {
  487. return !config.getWhitespaceChars().hat(c);
  488. };
  489. break;
  490. }
  491. default:
  492. {
  493. char current = regex->getText()[i];
  494. function
  495. = [current](char c, int flags) { return c == current; };
  496. break;
  497. }
  498. }
  499. Automata<char>* automata = new Automata<char>();
  500. State<char>* start = automata->addState();
  501. State<char>* end = automata->addState();
  502. end->setFinal(true);
  503. start->addTransition(Transition<char>(function, end, 0));
  504. results.add(automata);
  505. }
  506. }
  507. regex->release();
  508. return concat(results);
  509. }
  510. Framework::Regex::Group::Group(int start, int end, int id)
  511. : start(start),
  512. end(end),
  513. id(id)
  514. {}
  515. Framework::Regex::Group::Group()
  516. : start(-1),
  517. end(-1),
  518. id(-1)
  519. {}
  520. int Framework::Regex::Group::getStart()
  521. {
  522. return start;
  523. }
  524. int Framework::Regex::Group::getEnd()
  525. {
  526. return end;
  527. }
  528. int Framework::Regex::Group::getId()
  529. {
  530. return id;
  531. }
  532. Framework::Regex::Result::Result(Array<Group> groups)
  533. : ReferenceCounter(),
  534. groups(groups)
  535. {}
  536. int Framework::Regex::Result::getGroupCount()
  537. {
  538. return groups.getEintragAnzahl();
  539. }
  540. Group Framework::Regex::Result::getGroup(int index)
  541. {
  542. return groups.get(index);
  543. }
  544. int Framework::Regex::Result::getStart()
  545. {
  546. return groups.get(0).getStart();
  547. }
  548. int Framework::Regex::Result::getEnd()
  549. {
  550. return groups.get(0).getEnd();
  551. }
  552. RegexConfig::RegexConfig()
  553. {
  554. whitespaceChars = " \n\r\t";
  555. wordChars
  556. = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
  557. }
  558. void RegexConfig::setWhitespaceChars(Text whitespaceChars)
  559. {
  560. this->whitespaceChars = whitespaceChars;
  561. }
  562. void RegexConfig::setWordChars(Text wordChars)
  563. {
  564. this->wordChars = wordChars;
  565. }
  566. Text RegexConfig::getWhitespaceChars() const
  567. {
  568. return whitespaceChars;
  569. }
  570. Text RegexConfig::getWordChars() const
  571. {
  572. return wordChars;
  573. }
  574. Text Framework::Regex::quote(const Text& text)
  575. {
  576. Text result = "";
  577. for (int i = 0; i < text.getLength(); i++)
  578. {
  579. switch (text[i])
  580. {
  581. case '\\':
  582. case '.':
  583. case '[':
  584. case ']':
  585. case '(':
  586. case ')':
  587. case '{':
  588. case '}':
  589. case '+':
  590. case '*':
  591. case '?':
  592. case '|':
  593. case '^':
  594. case '$':
  595. result.append('\\');
  596. result.append(text[i]);
  597. break;
  598. default:
  599. result.append(text[i]);
  600. break;
  601. }
  602. }
  603. return result;
  604. }
  605. Automata<char>* Framework::Regex::parse(Text regex)
  606. {
  607. RegexConfig config;
  608. return parse(regex, config);
  609. }
  610. Automata<char>* Framework::Regex::parse(Text regex, RegexConfig& config)
  611. {
  612. int nextGroupId = 1;
  613. return ::parse(new Text(regex), nextGroupId, config);
  614. }