NFAScanner

Why

We've all written custom loaders for miscellaneous text formats a dozen times or more.  A compact, simple, generic, reusable system would be nice.  I'd also like to be able to configure it at run-time, so I don't have to recompile if I change one or more tokens.  And of course, working on compilers and related tech is just good fun.


What

A lexical analyzer that is simple, flexible, and easy to use. Tokens are defined via x-macros. These are used to configure an instance of NFAScanner at run-time, generating the required state machines. Feed your source text in, and get your tokens out. Some rudimentary error information is present, giving you an idea of where any unmatchable token was encountered.

Here are a few of the state machine definitions that you might require for a C-like programming language…


// TokenDecl.h - definition of some tokens for a C-like programming language.


// Opens the token definition list; the matching __endTokenDefs closes it.
__beginTokenDefs


// Fixed-text tokens. Each __declareTokenChar entry pairs a token name with a
// single character; each __declareTokenString entry pairs a token name with a
// multi-character string.
// NOTE(review): several string tokens share a first character with a char
// token ('&' / "&&", '|' / "||", '<' / "<=", '>' / ">=", '=' / "==", '!' / "!=").
// How the scanner chooses between them (longest match, declaration order, ...)
// is not visible in this file - confirm before reordering entries.

// Unary logical / bitwise operators.
__declareTokenChar(NotLogOp, '!')

__declareTokenChar(InvBitOp, '~')


// Binary bitwise operators.
__declareTokenChar(AndBitOp, '&')

__declareTokenChar(XorBitOp, '^')

__declareTokenChar(OrBitOp,  '|')


// Arithmetic operators.
__declareTokenChar(MulOp, '*')

__declareTokenChar(DivOp, '/')

__declareTokenChar(ModOp, '%')

__declareTokenChar(AddOp, '+')

__declareTokenChar(SubOp, '-')


// Binary logical operators.
__declareTokenString(AndLogOp, "&&")

__declareTokenString(OrLogOp, "||")


// Relational operators.
__declareTokenChar(LessthanOp, '<')

__declareTokenChar(GreaterthanOp, '>')


__declareTokenString(LessthanEqOp, "<=")

__declareTokenString(GreaterthanEqOp, ">=")


// Equality operators.
__declareTokenString(EqOp, "==")

__declareTokenString(NotEqOp, "!=")


// Punctuation.
__declareTokenChar(Comma, ',')


__declareTokenChar(Semicolon, ';')



// Brackets.
__declareTokenChar(LSqrParen, '[')

__declareTokenChar(RSqrParen, ']')


__declareTokenChar(LParen, '(')

__declareTokenChar(RParen, ')')



// Assignment.
__declareTokenChar(AssignOp, '=')



// LiteralFloat: state machine for floating-point literals.
// Possible start states: "digits1", "minus" and "dotstart".
// As declared, every path to "stop" passes through a '.' state ("dot" or
// "dotstart"), so a run of digits without a '.' does not complete this token.
// Assuming the Digit class matches 0-9 (TODO confirm), example paths are
// "1.", "-12.5" and ".5f".
// NOTE(review): "stop" is never declared as a state here; presumably it is the
// scanner's built-in accepting state - confirm against the macro definitions.
__declareToken(LiteralFloat) __startsWith("digits1") __startsWith("minus") __startsWith("dotstart")


    // Optional leading '-'; must be followed by integer digits or a leading '.'.
    __declareStateChar("minus",'-')__nextState("digits1") __nextState("dotstart")

    // Integer part: one or more Digit characters, then the '.' in state "dot".
    __declareStateClass("digits1", Digit) __nextState("digits1") __nextState("dot")

    // '.' with no integer part: at least one fractional digit must follow.
    __declareStateChar("dotstart", '.')__nextState("digits2")

    // '.' after an integer part: fractional digits, an 'f' suffix, or the end.
    __declareStateChar("dot", '.')__nextState("digits2") __nextState("f") __nextState("stop")

    // Fractional part: one or more Digit characters, optionally followed by 'f'.
    __declareStateClass("digits2", Digit) __nextState("digits2") __nextState("f") __nextState("stop")

    // Optional trailing 'f' suffix; nothing may follow it inside the token.
    __declareStateChar("f", 'f')__nextState("stop")


// End of the LiteralFloat state list.
__declareStop



// LiteralInt: state machine for signed integer literals.
// Possible start states: "digits" and "minus", i.e. an optional '-' followed
// by one or more Digit characters.
// NOTE(review): the leading '-' is also the SubOp token, and the digit prefix
// is shared with the other numeric literals; how the scanner resolves these
// overlaps is not visible in this file.
__declareToken(LiteralInt) __startsWith("digits") __startsWith("minus")


    // Optional leading '-'; at least one digit must follow.
    __declareStateChar("minus",'-')__nextState("digits")

    // One or more Digit characters, then the end of the token.
    __declareStateClass("digits", Digit) __nextState("digits")  __nextState("stop")


// End of the LiteralInt state list.
__declareStop



// LiteralUint: state machine for unsigned integer literals.
// The only start state is "digits" (no '-' start state is declared), and the
// only way to reach "stop" is through the 'u' suffix, so the suffix is
// mandatory: one or more Digit characters followed by 'u'.
__declareToken(LiteralUint) __startsWith("digits")


    // One or more Digit characters; the only other successor is the 'u' suffix.
    __declareStateClass("digits", Digit) __nextState("digits")  __nextState("u")

    // Required 'u' suffix (only the lowercase letter is declared).
    __declareStateChar("u", 'u')__nextState("stop")


// End of the LiteralUint state list.
__declareStop



// LiteralHex: state machine for hexadecimal literals.
// Shape as declared: '0', then 'x', then one or more HexDigit characters.
// Only the lowercase 'x' prefix is declared.
// NOTE(review): which characters the HexDigit class accepts (e.g. upper- vs
// lowercase a-f) is defined outside this file - confirm.
__declareToken(LiteralHex) __startsWith("0")


    // Leading '0' of the "0x" prefix.
    __declareStateChar("0",'0')__nextState("x")

    // 'x' of the prefix; at least one hex digit must follow.
    __declareStateChar("x",'x')__nextState("hexdigits")

    // One or more HexDigit characters, then the end of the token.
    __declareStateClass("hexdigits", HexDigit) __nextState("hexdigits")  __nextState("stop")


// End of the LiteralHex state list.
__declareStop



// LiteralBin: state machine for binary literals.
// Shape as declared: one or more '0'/'1' characters followed by a mandatory
// 'b' suffix (e.g. "0101b"); "stop" is reachable only from state "b".
// NOTE(review): state names such as "0" are also used by LiteralHex;
// presumably state names are scoped to their enclosing __declareToken group -
// confirm against the macro definitions.
__declareToken(LiteralBin) __startsWith("0") __startsWith("1")


    // A '0' bit: followed by another bit or the 'b' suffix.
    __declareStateChar("0",'0')__nextState("0") __nextState("1") __nextState("b")

    // A '1' bit: followed by another bit or the 'b' suffix.
    __declareStateChar("1",'1')__nextState("0") __nextState("1") __nextState("b")

    // Required 'b' suffix (only the lowercase letter is declared).
    __declareStateChar("b",'b')__nextState("stop")


// End of the LiteralBin state list.
__declareStop



// LiteralString: state machine for double-quoted string literals.
// Shape as declared: an opening '"', then any mix of AnyChar characters and
// backslash escapes, then a closing '"'.
// NOTE(review): if the AnyChar class also matches '"' and '\\', the "anychar",
// "q2" and "escape" successors overlap on those characters; this relies on the
// scanner exploring alternatives (it is described as NFA-based) - confirm how
// the end of the string is chosen.
__declareToken(LiteralString) __startsWith("q1")


    // Opening quote: followed by an escape, the closing quote (empty string),
    // or an ordinary character.
    __declareStateChar("q1",'\"')__nextState("escape")  __nextState("q2")  __nextState("anychar")

    // Ordinary string content; may repeat, start an escape, or hit the closing quote.
    __declareStateClass("anychar",AnyChar)__nextState("escape")  __nextState("q2")  __nextState("anychar")

    // Backslash: the next character is always consumed through "anychar", so
    // "q2" cannot directly follow the backslash.
    __declareStateChar("escape",'\\')__nextState("anychar")

    // Closing quote; ends the token.
    __declareStateChar("q2",'\"')__nextState("stop")


// End of the LiteralString state list.
__declareStop



// Identifier: state machine for identifiers.
// Possible start states: "_" and "alpha", so the first character is '_' or an
// Alpha-class character; every state may continue with an Alnum-class
// character or '_', or end the token.
// NOTE(review): "alpha" has no transition back to "alpha"; a second letter is
// only accepted if the Alnum class includes letters - confirm.
__declareToken(Identifier) __startsWith("_") __startsWith("alpha")


    // First character when it is a letter (Alpha class).
    __declareStateClass("alpha",Alpha)__nextState("alnum")  __nextState("_")  __nextState("stop")

    // Subsequent Alnum-class characters.
    __declareStateClass("alnum",Alnum)__nextState("alnum")  __nextState("_")  __nextState("stop")

    // Underscore; allowed as the first character and anywhere after it.
    __declareStateChar("_",'_')__nextState("alnum")  __nextState("_")  __nextState("stop")


// End of the Identifier state list.
__declareStop



// Closes the token definition list opened by __beginTokenDefs.
__endTokenDefs


Notes

For my purposes, it's quite functional.  I'm not suggesting you go and use it as the scanner for your next compiler, but you could certainly have some fun playing around with it.


Download

source code: NFAScanner.zip

sample usage: NFAScannerSample.zip


© Paul Glinker 2002 - 2016