% $Id: LexerTools.oz,v 1.18 2012/01/08 13:17:01 leavens Exp $ % % A lexical analyzer based on Paulson's book "ML for the working Programmer" % (Cambridge, 1991) Figure 9.2. Translated into Oz (via Haskell) % by Gary T. Leavens. % % A few differences from Paulson's version: % - Alphanumerics must start with an alphabetic character, % - _ is considered to be an alphanumeric, not a special character % - I added both intLit and floatLit tokens and string literals % - This file isn't as reusable and adaptable as Paulson's, % because I didn't use Oz modules. % - Keywords that consist entirely of symbols are tokenized so that the % longest one in the list of symbolic keywords is found % (instead of the shortest one, which is found in Paulson's version). % - Comments starting with the % and ending in a newline % are recognized and discarded as in Comment_Start below % \insert 'ParserFunctions.oz' % This lexical analyzer recognizes three kinds of tokens: % reserved keywords, identifiers (alphanumerics), % and numbers (integer literals). % Keywords can be either symbols, such as "if", or operators, such as ":=". % ::= key( ) | varId( ) % | atomExp( ) | strTok( ) % | intLit( ) | floatLit( ) % | label( ) % ::= pos:pos( ) % | pos:pos( ) % | % ::= % ::= % ::= % ::= % The following auxiliary functions are useful for recognizing various % classes of tokens. fun {IsKeyToken V} case V of key(...) then true else false end end fun {IsIdToken V} case V of varId(...) then true else false end end fun {IsAtomToken V} case V of atomExp(...) then true else false end end fun {IsIntToken V} case V of intLit(...) then true else false end end fun {IsFloatToken V} case V of floatLit(...) then true else false end end fun {IsStrToken V} case V of strTok(...) then true else false end end fun {IsLabelToken V} case V of label(...) then true else false end end % The following are some helpful parsing combinators. % (See the ParserFunctions for the general idea of parsing combinators.) % By far the most useful is P_Check. It allows one to check for the occurrence % of a particular keyword. For example one would write {P_Check 'if'} to parse % the keyword 'if'. % P_Check : }: > fun {P_Check K} fun {$ Inp} case Inp of key(S pos:C)|Ts andthen K == S then [key(S pos:C)#Ts] else nil end end end % The following can be used to parse an identifier token (one that is not % a keyword). % P_VarId: fun {P_VarId Inp} case Inp of varId(S pos:C)|Ts then [(varId(S pos:C)#Ts)] else nil end end % The following can be used to parse a number. % P_Number_Token : fun {P_Number_Token Inp} case Inp of intLit(I pos:C)|Ts then [(intLit(I pos:C)#Ts)] [] floatLit(F pos:C)|Ts then [(floatLit(F pos:C)#Ts)] else nil end end % The main function that performs lexical analysis is called `Scanning' below. % It uses the following list of special characters to determine what % characters can potentially compose symbolic keywords. % Note that _ is not in this set. Specials = "!@#$%^&*()+-=[]:\"|;`\\,./?'~<>{}" % The scanning function itself is a partially curried function that takes: % a list of alphanumeric keywords (like 'if'), % a list of symbolic keywords (like ":=" or "+"), % an accumulated list of tokens (usually [] to begin with, % accumulated in reverse), and % a String representing the input to be tokenized. % It returns a list of tokens, in order. % The implementation is a simple finite state machine. % % Scanning : }: % }: >> fun {Scanning Keys Syms FileName} Line = {NewCell 1} Col = {NewCell 1} fun {Scan_Loop Cs} POS = pos(FileName @Line @Col) in % {Show 'Scan_Loop'#Cs} case Cs of nil then nil [] C|Cs then if {Comment_Start C|Cs} then Line := @Line + 1 Col := 1 {Scan_Loop {Eat_Comment C|Cs}} elseif {Letter C} orelse C == &_ then % Identifier or Keyword local Id#Cs2 = {AlphaNumeric [C] Cs} Tok = {TokenOf Id POS} in Col := @Col + {Length Id} if Cs2 \= nil andthen Cs2.1 == &( then % start of a record label(Tok pos:POS)|{Scan_Loop Cs2} else % not a record Tok|{Scan_Loop Cs2} end end elseif C == &" then % String literal local Str#Cs2 = {StrLit nil &" Cs} in Col := @Col + {Length Str} + 2 strTok(Str pos:POS)|{Scan_Loop Cs2} end elseif C == &' then % atom literal local Atm#Cs2 = {StrLit nil &' Cs} Tok = atomExp({StringToAtom Atm} pos:POS) in Col := @Col + {Length Atm} + 2 if Cs2 \= nil andthen Cs2.1 == &( then % start of a record label(Tok pos:POS)|{Scan_Loop Cs2} else % not a record Tok|{Scan_Loop Cs2} end end elseif {NumberStart C Cs} then % numeric literal [[[Currently doesn't handle octal and hex]]] local IntPart#Cs2 = {Numeric [C] Cs} in if Cs2 == nil orelse Cs2.1 \= &. then Col := @Col + {Length IntPart} intLit({StringToInt IntPart} pos:POS)|{Scan_Loop Cs2} elseif Cs2.1 == &. andthen Cs2.2 \= nil andthen Cs2.2.1 == &. andthen Cs2.2.2 \= nil andthen Cs2.2.2.1 == &. then % it's an followed by 3 dots Col := @Col + {Length IntPart}+3 intLit({StringToInt IntPart} pos:POS) |key('...' pos:pos(FileName @Line @Col-3)) |{Scan_Loop {List.drop Cs2 3}} elseif Cs2.1 == &. then local FloatEnd#Cs3 = {FloatRest [Cs2.1] Cs2.2} WholeFloat = {Append IntPart FloatEnd} Col := @Col + {Length WholeFloat} in floatLit({StringToFloat WholeFloat} pos:POS) |{Scan_Loop Cs3} end end end elseif {Member C Specials} then % special character local Sy#Cs2 = {Specials_Run [C] Cs} Prefix_Sy#Cs3 = {Longest_Prefix_Symbol Sy Cs2} in if Prefix_Sy == "" then % not a keyword, so give an error raise unknownCharacter(C pos:POS) end else Col := @Col + {Length Prefix_Sy} key({StringToAtom Prefix_Sy} pos:POS)|{Scan_Loop Cs3} end end elseif C == &? % a question mark comment then Col := @Col+1 {Scan_Loop Cs} % skip it elseif C == & % a blank then Col := @Col+1 {Scan_Loop Cs} % skip it elseif C == &\t then Col := ((@Col div 8)+1)*8 {Scan_Loop Cs} % skip it elseif C == &\r andthen Cs \= nil andthen Cs.1 \= &\n then Line := @Line+1 Col := 1 {Scan_Loop Cs} % skip it elseif C == &\r andthen Cs \= nil andthen Cs.1 == &\n then Line := @Line+1 Col := 1 {Scan_Loop Cs.2} % skip both the \r and the \n elseif C == &\r orelse C == &\n then Line := @Line+1 Col := 1 {Scan_Loop Cs} % skip it elseif {Char.isSpace C} then Col := @Col+1 {Scan_Loop Cs} % skip it else raise unknownCharacter(C pos:POS) end end end end %% Comment_Start : }: Bool> fun {Comment_Start Str} case Str of &%|_ then true else false end end %% Eat_Comment: }: fun {Eat_Comment Str} case Str of nil then nil [] &\n|Cs then Cs [] _|Cs then {Eat_Comment Cs} end end %% AlphaNumeric : }: > fun {AlphaNumeric Id Cs} case Cs of nil then Id#nil [] C|Cs then if {Char.isAlNum C} orelse C == &_ then {AlphaNumeric {Append Id [C]} Cs} else Id#(C|Cs) end end end fun {TokenOf Id POS} IdSym = {StringToAtom Id} in if {Member IdSym Keys} then key(IdSym pos:POS) elseif {Char.isLower Id.1} then atomExp(IdSym pos:POS) else varId(IdSym pos:POS) end end %% Specials_Run : }: > fun {Specials_Run Sy Cs} case Cs of nil then Sy#nil [] C|Cs then if {Not {Member C Specials}} then Sy#(C|Cs) else {Specials_Run {Append Sy [C]} Cs} end end end %% Longest_Prefix_Symbol: }: > fun {Longest_Prefix_Symbol Sy Unread} case Sy of nil then nil#Unread else if {Member Sy Syms} then Sy#Unread else {Longest_Prefix_Symbol {List.take Sy {Length Sy}-1} {List.last Sy}|Unread} end end end %% NumberStart : }: Bool> fun {NumberStart C Cs} {Char.isDigit C} orelse (C == &~ andthen Cs \= nil andthen {Char.isDigit Cs.1}) end %% FloatRest : }: > fun {FloatRest SoFar Cs} FracPart#Cs2 = {Numeric SoFar Cs} in {Exponent FracPart Cs2} end %% Exponent : }: > fun {Exponent SoFar Chars} case Chars of C|Cs andthen C == &e orelse C == &E then {SignedInt {Append SoFar [C]} Cs} else SoFar#Chars end end %% SignedInt : }: > fun {SignedInt SoFar Chars} case Chars of C|Cs andthen ({Char.isDigit C} orelse C == &~) then {Numeric {Append SoFar [C]} Cs} else SoFar#Chars end end %% Numeric : }: > fun {Numeric SoFar Chars} case Chars of C|Cs andthen {Char.isDigit C} then {Numeric {Append SoFar [C]} Cs} else SoFar#Chars end end %% Strlit : }: > fun {StrLit Str QC Cs} case Cs of nil then raise error('unterminated string literal') end [] C|Cs then if C == QC then Str#Cs else {StrLit {Append Str [C]} QC Cs} end end end in Scan_Loop end