#include "Regex.h" using namespace Framework; using namespace Regex; int findEndPos(Text* regex, int startPos, char endChar, char startChar) { int length = regex->getLength(); int i = startPos; int opened = 0; while (i < length) { if (regex->getText()[i] == '\\') { i++; } else if (regex->getText()[i] == startChar) { opened++; } else if (regex->getText()[i] == endChar) { if (opened == 0) { return i; } opened--; } i++; } return -1; } Automata* parseCharacterList(Text* regex, RegexConfig& config) { bool negated = regex->hatAt(0, "^"); Automata* result = new Automata(); State* start = result->addState(); State* end = result->addState(); end->setFinal(true); int length = regex->getLength(); bool escaped = false; bool minusValid = false; Text characterList; std::function negatedFunction = [](char c, int flags) { return true; }; for (int i = 0; i < length; i++) { switch (regex->getText()[i]) { case '\\': { escaped = true; minusValid = false; break; } case '-': if (escaped || !minusValid) { characterList += "-"; } else if (minusValid) { if (i == length - 1) { characterList += "-"; } else { unsigned char before = (unsigned char) characterList[characterList.getLength() - 1]; characterList.remove(characterList.getLength() - 1, 1); unsigned char after = (unsigned char)regex->getText()[i + 1]; if (before > after) { unsigned char temp = before; before = after; after = temp; } for (unsigned char c = before; c <= after; c++) { characterList.append((char)c); } i++; } } break; default: { if (escaped) { if (regex->getText()[i] == 'w') { if (negated) { negatedFunction = [config, negatedFunction](char c, int flags) { return negatedFunction(c, flags) && !config.getWordChars().hat(c); }; } else { characterList += config.getWordChars(); } minusValid = false; } else if (regex->getText()[i] == 'W') { if (negated) { characterList += config.getWordChars(); } else { start->addTransition(Transition( [config](char c, int flags) { return !config.getWordChars().hat(c); }, end, 0)); } minusValid = false; } else if (regex->getText()[i] == 'd') { if (negated) { negatedFunction = [config, negatedFunction](char c, int flags) { return negatedFunction(c, flags) && (c < '0' || c > '9'); }; } else { characterList.append("0123456789"); } minusValid = false; } else if (regex->getText()[i] == 's') { if (negated) { negatedFunction = [config, negatedFunction]( char c, int flags) { return negatedFunction(c, flags) && !config.getWhitespaceChars().hat(c); }; } else { characterList.append(config.getWhitespaceChars()); } minusValid = false; } else if (regex->getText()[i] == 'D') { if (negated) { negatedFunction = [negatedFunction](char c, int flags) { return negatedFunction(c, flags) && (c >= '0' && c <= '9'); }; } else { start->addTransition(Transition( [](char c, int flags) { return c < '0' || c > '9'; }, end, 0)); } minusValid = false; } else if (regex->getText()[i] == 'S') { if (negated) { negatedFunction = [config, negatedFunction](char c, int flags) { return negatedFunction(c, flags) && config.getWhitespaceChars().hat(c); }; } else { start->addTransition(Transition( [config](char c, int flags) { return !config.getWhitespaceChars().hat(c); }, end, 0)); } minusValid = false; } else if (regex->getText()[i] == 'n') { characterList.append('\n'); } else if (regex->getText()[i] == 'r') { characterList.append('\r'); } else if (regex->getText()[i] == 't') { characterList.append('\t'); } else { characterList.append(regex->getText()[i]); minusValid = true; } } else { characterList.append(regex->getText()[i]); minusValid = true; } break; } } } if (negated) { negatedFunction = [characterList, negatedFunction](char c, int flags) { return negatedFunction(c, flags) && !characterList.hat(c); }; start->addTransition(Transition(negatedFunction, end, 0)); } else { start->addTransition(Transition( [characterList](char c, int flags) { return characterList.hat(c); }, end, 0)); } regex->release(); return result; } Automata* concat(Array*> parsers) { if (parsers.getEintragAnzahl() == 0) { Automata* result = new Automata(); State* start = result->addState(); State* end = result->addState(); end->setFinal(true); start->addTransition(Transition(0, end, 0)); return result; } Automata* result = 0; for (Automata* parser : parsers) { if (result == 0) { result = parser; } else { result = concat(result, parser); } } return result; } Automata* parse(Text* regex, int& nextGroupId, RegexConfig& config) { Array*> results; int length = regex->getLength(); for (int i = 0; i < length; i++) { bool escaped = false; switch (regex->getText()[i]) { case '\\': { escaped = true; i++; break; } case '(': { int end = findEndPos(regex, i + 1, ')', '('); Text* subRegex = regex->getTeilText(i + 1, end); Automata* subAutomata = 0; if (subRegex->hatAt(0, "?:")) { subRegex->remove(0, 2); subAutomata = parse(subRegex, nextGroupId, config); } else { int groupId = nextGroupId++; subAutomata = group(parse(subRegex, nextGroupId, config), groupId); } results.add(subAutomata); i = end; break; } case '[': { int end = findEndPos(regex, i + 1, ']', 0); Text* subRegex = regex->getTeilText(i + 1, end); Automata* subAutomata = parseCharacterList(subRegex, config); results.add(subAutomata); i = end; break; } case '?': { Automata* last = results.get(results.getEintragAnzahl() - 1); results.remove(results.getEintragAnzahl() - 1); results.add(maybe(last)); break; } case '*': { bool lazy = false; if (i < length - 1 && regex->getText()[i + 1] == '?') { lazy = true; i++; } Automata* last = results.get(results.getEintragAnzahl() - 1); results.remove(results.getEintragAnzahl() - 1); results.add(many(last, lazy)); break; } case '+': { bool lazy = false; if (i < length - 1 && regex->getText()[i + 1] == '?') { lazy = true; i++; } Automata* last = results.get(results.getEintragAnzahl() - 1); results.remove(results.getEintragAnzahl() - 1); results.add(atLeastOnce(last, lazy)); break; } case '.': { Automata* any = new Automata(); State* start = any->addState(); State* end = any->addState(); end->setFinal(true); start->addTransition(Transition( [](char c, int flags) { return true; }, end, 0)); results.add(any); break; } case '|': { Automata* result = oneOf(concat(results), parse(regex->getTeilText(i + 1), nextGroupId, config)); regex->release(); return result; } case '{': { int end = findEndPos(regex, i + 1, '}', 0); Text* subRegex = regex->getTeilText(i + 1, end); int min = 0; int max = 0; if (subRegex->hat(",")) { Text* minText = subRegex->getTeilText(0, subRegex->positionVon(",")); Text* maxText = subRegex->getTeilText(subRegex->positionVon(",") + 1); min = (int)*minText; max = (int)*maxText; minText->release(); maxText->release(); } else { min = (int)*subRegex; max = min; } Automata* last = results.get(results.getEintragAnzahl() - 1); results.remove(results.getEintragAnzahl() - 1); results.add(repeat(last, min, max)); i = end; subRegex->release(); break; } case '^': { Automata* qutomata = new Automata(); State* start = qutomata->addState(); State* end = qutomata->addState(); end->setFinal(true); start->addTransition(Transition(0, end, Flags::START)); results.add(qutomata); break; } case '$': { Automata* qutomata = new Automata(); State* start = qutomata->addState(); State* end = qutomata->addState(); end->setFinal(true); start->addTransition(Transition(0, end, Flags::END)); results.add(qutomata); break; } default: { Automata* automata = new Automata(); State* start = automata->addState(); State* end = automata->addState(); end->setFinal(true); char current = regex->getText()[i]; start->addTransition(Transition( [current](char c, int flags) { return c == current; }, end, 0)); results.add(automata); break; } } if (escaped) { std::function function; switch (regex->getText()[i]) { case 'n': { function = [](char c, int flags) { return c == '\n'; }; break; } case 'r': { function = [](char c, int flags) { return c == '\r'; }; break; } case 't': { function = [](char c, int flags) { return c == '\t'; }; break; } case 'w': { function = [config](char c, int flags) { return config.getWordChars().hat(c); }; break; } case 'W': { function = [config](char c, int flags) { return !config.getWordChars().hat(c); }; break; } case 'd': { function = [](char c, int flags) { return c >= '0' && c <= '9'; }; break; } case 'D': { function = [](char c, int flags) { return c < '0' || c > '9'; }; break; } case 's': { function = [config](char c, int flags) { return config.getWhitespaceChars().hat(c); }; break; } case 'S': { function = [config](char c, int flags) { return !config.getWhitespaceChars().hat(c); }; break; } default: { char current = regex->getText()[i]; function = [current](char c, int flags) { return c == current; }; break; } } Automata* automata = new Automata(); State* start = automata->addState(); State* end = automata->addState(); end->setFinal(true); start->addTransition(Transition(function, end, 0)); results.add(automata); } } regex->release(); return concat(results); } Framework::Regex::Group::Group(int start, int end, int id) : start(start), end(end), id(id) {} Framework::Regex::Group::Group() : start(-1), end(-1), id(-1) {} int Framework::Regex::Group::getStart() { return start; } int Framework::Regex::Group::getEnd() { return end; } int Framework::Regex::Group::getId() { return id; } Framework::Regex::Result::Result(Array groups) : ReferenceCounter(), groups(groups) {} int Framework::Regex::Result::getGroupCount() { return groups.getEintragAnzahl(); } Group Framework::Regex::Result::getGroup(int index) { return groups.get(index); } int Framework::Regex::Result::getStart() { return groups.get(0).getStart(); } int Framework::Regex::Result::getEnd() { return groups.get(0).getEnd(); } RegexConfig::RegexConfig() { whitespaceChars = " \n\r\t"; wordChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; } void RegexConfig::setWhitespaceChars(Text whitespaceChars) { this->whitespaceChars = whitespaceChars; } void RegexConfig::setWordChars(Text wordChars) { this->wordChars = wordChars; } Text RegexConfig::getWhitespaceChars() const { return whitespaceChars; } Text RegexConfig::getWordChars() const { return wordChars; } Text Framework::Regex::quote(const Text& text) { Text result = ""; for (int i = 0; i < text.getLength(); i++) { switch (text[i]) { case '\\': case '.': case '[': case ']': case '(': case ')': case '{': case '}': case '+': case '*': case '?': case '|': case '^': case '$': result.append('\\'); result.append(text[i]); break; default: result.append(text[i]); break; } } return result; } Automata* Framework::Regex::parse(Text regex) { RegexConfig config; return parse(regex, config); } Automata* Framework::Regex::parse(Text regex, RegexConfig& config) { int nextGroupId = 1; return ::parse(new Text(regex), nextGroupId, config); }