How a Regex Engine Work?

By Bluehive | freethreads | 7 Feb 2020


 

A regular expression defines a set of string.

A regex engine matches a pattern against input and returns true/false for the condition that pattern exists in the input.
I have got a C++ implementation of a regex engine which uses simple constructs of matching the following:

  1. A single character
  2. An equal length pattern and input
  3. A pattern that has ^ and $
  4. Wildcards (? and *)

It is inspired by https://nickdrane.com/build-your-own-regex/ and https://swtch.com/~rsc/regexp/regexp1.html.

The C++ code is as follows:

#include<iostream>

using namespace std;

bool MatchQuestion(const char * pattern, const char *input);
bool MatchStar(const char *pattern, const char *input);

bool matchOne(char pattern, char input)
{
  cout << "matchOne" << "patt=" << pattern << "input=" << input << endl;
 if (pattern == '\0')
     return true;

  if (input == '\0')
      return false;

  return pattern == input;
}

bool MatchSameLength(const char *pattern, const char* input) 
{
    if (*pattern == '\0')
        return true;

    return matchOne(pattern[0], input[0]) && MatchSameLength(pattern+1, input+1);
}

bool MatchAnywhere(const char *pattern, const char *input)
{
    //cout << "patt=" << *pattern << "input=" << *input << endl;
    if (*pattern == '\0')
        return true;

    if (*pattern == '$' && *input == '\0')
        return true;

    if (pattern[1] == '?') {
        return MatchQuestion(pattern, input);
    }

    if (pattern[1] == '*') {
        return MatchStar(pattern, input);
    }

    return matchOne(pattern[0], input[0]) && MatchAnywhere(pattern+1, input+1);
}

bool MatchQuestion(const char * pattern, const char *input)
{
    return (
        MatchAnywhere(pattern+2, input) ||
            (matchOne(*pattern, *input) && MatchAnywhere(pattern+2, input))
            );
}

bool MatchStar(const char *pattern, const char *input)
{
 return (
     MatchAnywhere(pattern+2, input) ||
     (matchOne(*pattern, *input) && MatchAnywhere(pattern, input+1))
 );
}

bool search(const char *pattern, const char *input )
{
    if (*pattern == '^') {
        return MatchAnywhere(pattern+1, input);
    }

    while (*input) {
        if (MatchAnywhere(pattern, input))
            return true;
        ++input;
        }
    return false;    
}

int main() {
    //cout << MatchSameLength("aab", "aab") << endl;
    //cout << MatchAnywhere("^b$", "b") << endl;

    cout << search("ax*b", "abc") << endl;
    return 1;
}

How do you rate this article?

0



freethreads
freethreads

Notes on Distributed Systems, Containers, Kubernetes, Databases, Linux, Python, GoLang, C/C++

Send a $0.01 microtip in crypto to the author, and earn yourself as you read!

20% to author / 80% to me.
We pay the tips from our rewards pool.