module polemy.lex;
import polemy._common;
/*
* Author: k.inaba
* License: NYSL 0.9982 (http://www.kmonos.net/nysl/)
* Lexer for the polemy programming language
*/
import std.file : readText;
import std.string : munch;
import std.ctype;
/// A location inside a source text: file name plus line and column numbers.
class LexPosition
{
    immutable string filename; ///< name of the source file
    immutable int    lineno;   ///< line number: 1, 2, ...
    immutable int    column;   ///< column: 1, 2, ...

    /// Renders the position in the form "filename:lineno:column".
    override string toString() const
    {
        return sprintf!"%s:%d:%d"(filename, lineno, column);
    }

    // Constructor and comparison come from mixins defined outside this file.
    mixin SimpleConstructor;
    mixin SimpleCompare;
}
unittest
{
    // Two positions on the same line that differ only in their column.
    auto first  = new LexPosition("hello.cpp", 123, 45);
    auto second = new LexPosition("hello.cpp", 123, 46);

    // Fields hold the constructor arguments in declaration order.
    assert( first.filename == "hello.cpp" );
    assert( first.lineno == 123 );
    assert( first.column == 45 );
    assert( to!string(first) == "hello.cpp:123:45" );

    // Ordering and equality.
    assert( first < second );
    assert( first != second );

    // No default constructor, and no field may be reassigned.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );
}
/// Represents a lexer token: an immutable (position, text, kind) triple.
class Token
{
/// Category assigned to a token by the lexer.
enum Kind {identifier, stringLiteral, number};
immutable LexPosition pos; ///< position where the token occurred in the source
immutable string str; ///< the token string itself
immutable Kind kind; ///< which kind of token?
// The constructor and the comparison operators are supplied by these mixins,
// which are defined outside this file (presumably in polemy._common -- TODO confirm).
mixin SimpleConstructor;
mixin SimpleCompare;
}
unittest
{
    auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto t = new Token(p, "class", Token.Kind.identifier);

    // Fields hold the constructor arguments.
    assert( t.pos == p );
    assert( t.str == "class" );
    assert( t.kind == Token.Kind.identifier );

    // Equality and ordering.
    assert( t == new Token(p, "class", Token.Kind.identifier) );
    assert( t < new Token(p, "struct", Token.Kind.identifier) );

    // No default constructor.
    assert( !__traits(compiles, new Token) );

    // Fields must not be reassignable. Each check assigns a value of the
    // field's own type, so that the only possible reason for the compile
    // failure is immutability (assigning an int to `str` would be rejected
    // for its type alone and would prove nothing).
    assert( !__traits(compiles, t.pos=p) );
    assert( !__traits(compiles, t.str="struct") );
    assert( !__traits(compiles, t.kind=Token.Kind.number) );
}
/// Named Constructor for Lexer: lexes the text of the file `filename`.
///
/// The file name is also used as the name recorded in token positions.
/// Any extra arguments in `rest` are forwarded to lexerFromString after the
/// file name.
Lexer lexerFromFile(T...)( string filename, T rest )
{
    // Call `readText` through the name bound by `import std.file : readText;`.
    // A selective import does not make the qualified name `std.file.readText`
    // available by itself.
    return lexerFromString( readText(filename), filename, rest );
}
/// Named Constructor for Lexer: lexes the text `str`.
///
/// `filename`, `lineno` and `column` give the position assigned to the first
/// character of `str`.
Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    auto lexer = new Lexer(str, filename, lineno, column);
    return lexer;
}
/// Lexer is a forward range of Tokens
class Lexer
{
    /// Range primitive: true once every token has been consumed.
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the token at the head of the range.
    /// Fails through `enforce` when the lexer is already exhausted.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: discards the head token and reads the next one.
    /// Fails through `enforce` when the lexer is already exhausted.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Forward-range primitive: a new Lexer with the same remaining input,
    /// position and head token; advancing one does not advance the other.
    Lexer save() /*@property*/
    {
        return new Lexer(buffer, filename, lineno, column, current);
    }

private: // implementation

    string buffer;   // input that has not been consumed yet
    string filename; // name reported in token positions
    int    lineno;   // line of buffer[0]
    int    column;   // column of buffer[0]
    Token  current;  // head token; null once the input is exhausted

    // Between public calls the unread input never starts with whitespace.
    invariant()
    {
        assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
    }

    /// Starts lexing `buffer` at the given position. When `current` is
    /// non-null it is used as the head token instead of reading one from
    /// `buffer` (this is what save() relies on).
    this( string buffer, string filename, int lineno, int column, Token current=null )
    {
        this.buffer = buffer;
        this.filename = filename;
        this.lineno = lineno;
        this.column = column;
        skipws();
        this.current = (current is null ? readNext() : current);
    }

    /// Consumes leading whitespace, keeping lineno/column in sync.
    void skipws()
    {
        bool progress = false;
        do
        {
            // Blanks that stay on the current line. '\v' and '\f' are listed
            // because std.ctype.isspace accepts them: leaving them in place
            // would violate the class invariant and make readNext() produce
            // an empty token without consuming any input.
            string ws = buffer.munch(" \t\v\f");
            column += ws.length;
            progress = !ws.empty;

            // Line breaks: "\n", "\r" and "\r\n" each count as one.
            while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
            {
                progress = true;
                if( buffer[0]=='\r' && buffer.length>=2 && buffer[1]=='\n' )
                    buffer = buffer[2..$];
                else
                    buffer = buffer[1..$];
                lineno ++;
                column = 1;
            }
        }while( progress );
    }

    /// Consumes and returns one character, advancing the column by one.
    /// The buffer must not be empty.
    char readChar()
    {
        char c = buffer[0];
        buffer = buffer[1..$];
        column ++;
        return c;
    }

    /// This is the main lexing routine.
    /// Returns the next token, or null when the input is exhausted.
    /// Expects the buffer not to start with whitespace; trailing whitespace
    /// after the token is skipped before returning.
    Token readNext()
    {
        // '#' starts a comment running to the end of the line. Comments are
        // skipped in a loop so that a long run of comment lines does not
        // deepen the call stack.
        while( !buffer.empty && buffer[0]=='#' )
        {
            while( !buffer.empty && buffer[0]!='\n' && buffer[0]!='\r' )
                readChar();
            skipws();
        }

        if( buffer.empty )
            return null;
        scope(exit)
            skipws();

        if( buffer[0] == '"' )
            return readStringLiteral();

        auto pos = currentPosition();

        // Any other symbol character is a token on its own.
        if( isSymbol(buffer[0]) )
            return new Token(pos, ""~readChar(), Token.Kind.identifier);

        // Otherwise take the longest run of non-space, non-symbol characters.
        size_t i = 0;
        while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
            ++i;
        auto str = buffer[0 .. i];
        buffer = buffer[i .. $];
        column += i;

        // A word made only of '0'..'9' is a number; anything else is an identifier.
        bool isNumber = find!(`a<'0' || '9'<a`)(str).empty;
        return new Token(pos, str, isNumber ? Token.Kind.number : Token.Kind.identifier);
    }

    /// Reads a string literal whose opening '"' is at buffer[0].
    /// The token text excludes the surrounding quotes. `\\` and `\"` are
    /// unescaped; a backslash before any other character is kept as is.
    /// Line breaks inside the literal are stored as '\n'. A literal that is
    /// still open at end of input is returned with what was read so far.
    Token readStringLiteral()
    {
        auto pos = currentPosition();
        string lit;
        readChar(); // opening quote
        while( !buffer.empty && buffer[0]!='"' )
        {
            char c = readChar();
            if( c == '\\' )
            {
                if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
                    lit ~= readChar();
                else
                    lit ~= c;
            }
            else if( c == '\n' )
            {
                lit ~= c;
                lineno++;
                column = 1;
            }
            else if( c == '\r' )
            {
                // "\r\n" and a lone "\r" are both one line break
                if( !buffer.empty && buffer[0]=='\n' )
                    readChar();
                lit ~= '\n';
                lineno++;
                column = 1;
            }
            else
                lit ~= c;
        }
        if( !buffer.empty )
            readChar(); // closing quote
        return new Token(pos, lit, Token.Kind.stringLiteral);
    }

    /// True for characters in 0x21..0x7f that are neither alphanumeric nor '_'.
    bool isSymbol(char c)
    {
        return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
    }

    /// The position of the next unread character.
    immutable(LexPosition) currentPosition()
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
unittest
{
    // Lexer must qualify as a forward range.
    bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
    assert( lexerIsForwardRange );
}
unittest
{
    // Checks position and text of a symbol token.
    void checkSymbol(Token tok, int line, int col, string text)
    {
        assert( tok.pos.lineno == line );
        assert( tok.pos.column == col );
        assert( tok.str == text );
    }

    // Checks position, kind and text of an identifier token.
    void checkWord(Token tok, int line, int col, string text)
    {
        assert( tok.pos.lineno == line );
        assert( tok.pos.column == col );
        assert( tok.kind == Token.Kind.identifier );
        assert( tok.str == text );
    }

    auto lex = lexerFromString("this is a \t\n pen :-( ");
    Token[] ts = std.array.array(lex);

    checkWord(ts[0], 1, 1, "this");
    checkWord(ts[1], 1, 6, "is");
    checkWord(ts[2], 1, 9, "a");
    checkWord(ts[3], 2, 2, "pen");

    // consecutive symbols are always separated
    // hence, no "++" or "<<" or ...
    checkSymbol(ts[4], 2, 6, ":");
    checkSymbol(ts[5], 2, 7, "-");
    checkSymbol(ts[6], 2, 8, "(");

    assert( ts.length == 7 );
}
unittest
{
    // Checks text and kind of the head token of a lexer.
    void expectFront(Lexer lx, string text, Token.Kind kind)
    {
        assert( lx.front.str == text );
        assert( lx.front.kind == kind );
    }

    auto lexer = lexerFromString(" a12\n3a 5 ");
    expectFront(lexer, "a12", Token.Kind.identifier);
    lexer.popFront();

    // Snapshot taken while "3a" is the head token.
    auto snapshot = lexer.save();
    expectFront(lexer, "3a", Token.Kind.identifier);
    lexer.popFront();

    // Advancing the original must not move the snapshot.
    expectFront(snapshot, "3a", Token.Kind.identifier);
    expectFront(lexer, "5", Token.Kind.number);

    lexer.popFront();
    snapshot.popFront();
    assert( lexer.empty );
    assert( !snapshot.empty );
    expectFront(snapshot, "5", Token.Kind.number);
}
unittest
{
    //!! be sure to run the unittest on the root of the source directory
    auto fileLexer = lexerFromFile("polemy/lex.d");

    // Checks line and column of the head token.
    void expectPos(int line, int col)
    {
        assert( fileLexer.front.pos.lineno == line );
        assert( fileLexer.front.pos.column == col );
    }

    // The file starts with "module polemy.lex;" followed by an import line.
    assert( fileLexer.front.str == "module", fileLexer.front.str );
    assert( fileLexer.front.pos.filename == "polemy/lex.d" );
    expectPos(1, 1);
    fileLexer.popFront();

    assert( fileLexer.front.str == "polemy" );
    expectPos(1, 8);
    fileLexer.popFront();

    assert( fileLexer.front.str == "." );
    fileLexer.popFront();

    assert( fileLexer.front.str == "lex" );
    fileLexer.popFront();

    assert( fileLexer.front.str == ";" );
    fileLexer.popFront();

    assert( fileLexer.front.str == "import" );
    expectPos(2, 1);
}
unittest
{
    // Input mixing '#' comments, adjacent string literals, escapes and a
    // string literal that spans several lines.
    auto lex = lexerFromString(`my # comment should
# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa
bbb # 123
eee"
zzz
`);
    Token[] toks = std.array.array(lex);

    // Comments are dropped, including a line that is only a comment.
    assert( toks[0].str == "my" );
    assert( toks[0].pos.lineno == 1 );
    assert( toks[1].str == "be" );
    assert( toks[1].pos.lineno == 3 );
    assert( toks[2].str == "ignored" );
    assert( toks[3].str == "." );

    // An identifier directly followed by two string literals.
    assert( toks[4].str == "hahaha" );
    assert( toks[4].pos.lineno == 4 );
    assert( toks[4].kind == Token.Kind.identifier );
    assert( toks[5].str == "hihihi" );
    assert( toks[5].pos.lineno == 4 );
    assert( toks[5].kind == Token.Kind.stringLiteral );
    assert( toks[6].str == `hu\"huhu` );
    assert( toks[6].kind == Token.Kind.stringLiteral );
    assert( toks[6].pos.lineno == 4 );

    // The trailing comment on line 4 is skipped.
    assert( toks[7].str == "123" );
    assert( toks[7].pos.lineno == 5 );
    assert( toks[7].kind == Token.Kind.number );
    assert( toks[8].str == "aa" );

    // '#' inside a string literal is not a comment; line breaks are kept.
    assert( toks[9].pos.lineno == 5 );
    assert( toks[9].str == "aaa\nbbb # 123\neee" );
    assert( toks[9].kind == Token.Kind.stringLiteral );

    // Line counting continues correctly after the multi-line literal.
    assert( toks[10].pos.lineno == 8 );
    assert( toks.length == 11 );
}