/**
* Authors: k.inaba
* License: NYSL 0.9982 http://www.kmonos.net/nysl/
*
* Lexer for Polemy programming language.
*/
module polemy.lex;
import polemy._common;
import std.file : readText;
import std.string : munch;
import std.ctype;
/// A location inside a source text: file name plus line and column numbers.
class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int lineno;      /// line number, 1, 2, ...
    immutable int column;      /// column, 1, 2, ...

    /// Renders the position as "filename:lineno:column".
    override string toString() const
    {
        string rendered = sprintf!"%s:%d:%d"(filename, lineno, column);
        return rendered;
    }

    // Project mixins (not defined in this module). Presumably they generate the
    // field-wise constructor and the comparison operators from the fields above,
    // so the declaration order of the fields is kept as is.
    mixin SimpleConstructor;
    mixin SimpleCompare;
}
unittest
{
    // Two positions on the same line of the same file, one column apart.
    auto here = new LexPosition("hello.cpp", 123, 45);
    auto next = new LexPosition("hello.cpp", 123, 46);

    // Fields carry the constructor arguments.
    assert_eq( here.filename, "hello.cpp" );
    assert_eq( here.lineno, 123 );
    assert_eq( here.column, 45 );

    // String conversion goes through toString.
    assert_eq( to!string(here), "hello.cpp:123:45" );

    // Ordering and inequality between the two positions.
    assert_lt( here, next );
    assert_ne( here, next );

    // No default constructor, and every field is read-only.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, here.filename="foo") );
    assert( !__traits(compiles, here.lineno =789) );
    assert( !__traits(compiles, here.column =222) );
}
/// Represents a lexer token
///
/// A token is an immutable triple of source position, text and a flag telling
/// whether the text came from a double-quoted string literal.
class Token
{
immutable LexPosition pos; /// Position where the token occurred in the source
immutable string str; /// The token string itself (for quoted tokens: the literal's content, without the surrounding quotes)
immutable bool quoted; /// Was it a "quoted" token or unquoted?
// Project mixins (not defined in this module). Presumably they derive the
// constructor and the comparison operators from the fields above, in
// declaration order -- TODO confirm against polemy._common.
mixin SimpleConstructor;
mixin SimpleCompare;
}
unittest
{
    auto where = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto plainTok = new Token(where, "class", false);
    auto quotedTok = new Token(where, "class", true);

    // Fields carry the constructor arguments.
    assert_eq( plainTok.pos, where );
    assert_eq( plainTok.str, "class" );
    assert( !plainTok.quoted );
    assert( quotedTok.quoted );

    // Comparison takes the text and the quoted flag into account.
    assert_eq( plainTok, new Token(where, "class", false) );
    assert_lt( plainTok, new Token(where, "struct", false) );
    assert_ne( plainTok, quotedTok );

    // No default constructor, and every field is read-only.
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, plainTok.pos=where) );
    assert( !__traits(compiles, plainTok.str=789) );
    assert( !__traits(compiles, plainTok.quoted=true) );
}
/// Named Constructor for Lexer: lexes the contents of the file `filename`.
///
/// The file is read as text in one go; `filename` is also used as the file
/// name recorded in token positions. Any extra arguments in `rest` are
/// forwarded to lexerFromString after the file name.
Lexer lexerFromFile(T...)( string filename, T rest )
{
    string source = readText(filename);
    return lexerFromString( source, filename, rest );
}
/// Named Constructor for Lexer: lexes the in-memory text `str`.
///
/// Params:
///   str      = the source text to tokenize
///   filename = file name recorded in the position of every token
///   lineno   = line number assigned to the first line of `str`
///   column   = column number assigned to the first character of `str`
Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    auto lexer = new Lexer(str, filename, lineno, column);
    return lexer;
}
/// Lexer is a forward range of Tokens
///
/// Lexical rules implemented by this class:
///  - spaces, tabs, vertical tabs, form feeds and line breaks
///    ("\n", "\r", "\r\n") separate tokens and are otherwise ignored;
///  - '#' starts a comment that extends to the end of the line;
///  - '"' starts a string literal, in which \\ and \" are the only escapes
///    and "\r\n" / "\r" line breaks are stored as "\n";
///  - any other character accepted by isSymbol is a one-character token;
///  - any other maximal run of non-space, non-symbol characters is one token.
class Lexer
{
    /// Range primitive: true once all tokens have been consumed
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token.
    /// Throws (through enforce) when the lexer is already empty.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token.
    /// Throws (through enforce) when the lexer is already empty.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: returns an independent copy positioned at the same token
    Lexer save() /*@property*/
    {
        // tupleof expands to the fields in declaration order, which matches
        // the parameter order of the private constructor below.
        return new Lexer(this.tupleof);
    }

private: // implementation

    string buffer;   // text that has not been consumed yet
    string filename; // file name recorded in token positions
    int    lineno;   // line number of buffer[0]
    int    column;   // column number of buffer[0]
    Token  current;  // token returned by front(), null at the end of input

    // The unread text never starts with whitespace: skipws() has to consume
    // every character that std.ctype.isspace accepts.
    invariant()
    {
        assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
    }

    /// Builds a lexer over `buffer`. When `current` is null the first token is
    /// read immediately; otherwise `current` is taken as the token already
    /// read (this is how save() clones a lexer).
    this( string buffer, string filename, int lineno, int column, Token current=null )
    {
        this.buffer   = buffer;
        this.filename = filename;
        this.lineno   = lineno;
        this.column   = column;
        skipws();
        this.current = (current is null ? readNext() : current);
    }

    /// Consumes leading whitespace, keeping lineno/column up to date.
    void skipws()
    {
        bool progress = false;
        do
        {
            // Horizontal whitespace. Vertical tab and form feed are included
            // because std.ctype.isspace treats them as spaces too: leaving
            // them in the buffer would break the class invariant and make
            // readNext() return empty tokens without consuming any input.
            string ws = buffer.munch(" \t\v\f");
            column += ws.length;
            progress = !ws.empty;

            // Line breaks: "\n", "\r" and "\r\n" each count as one line.
            while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
            {
                progress = true;
                if( buffer[0] == '\n' )
                    buffer = buffer[1..$];
                else // buffer[0] == '\r'
                {
                    buffer = buffer[1..$];
                    if( !buffer.empty && buffer[0]=='\n' )
                        buffer = buffer[1..$];
                }
                lineno ++;
                column = 1;
            }
        }while( progress );
    }

    /// Consumes and returns one character, advancing the column.
    /// The caller must make sure the buffer is not empty.
    char readChar()
    {
        scope(exit) {
            buffer = buffer[1..$];
            column ++;
        }
        return buffer[0];
    }

    /// This is the main lexing routine.
    /// Returns the next token, or null at the end of input.
    /// On return the buffer is positioned at the start of the following token.
    Token readNext()
    {
        // Skip comments. This is a loop rather than a recursive call so that
        // a long run of comment lines cannot exhaust the stack.
        while( !buffer.empty && buffer[0]=='#' )
        {
            while( !buffer.empty && buffer[0]!='\n' && buffer[0]!='\r' )
                readChar();
            skipws();
        }

        if( buffer.empty )
            return null;

        // Whatever token is produced, leave the buffer at the next token.
        scope(exit)
            skipws();

        if( buffer[0] == '"' )
        {
            return readStringLiteral();
        }
        else if( isSymbol(buffer[0]) )
        {
            // normal symbol: always a single character
            auto pos = currentPosition();
            auto str = ""~readChar();
            return new Token(pos, str, false);
        }
        else
        {
            // word: everything up to the next whitespace or symbol
            auto pos = currentPosition();
            int i = 0;
            while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
                ++i;
            auto str = buffer[0 .. i];
            buffer = buffer[i .. $];
            column += i;
            return new Token(pos, str, false);
        }
    }

    /// Reads a string literal; the buffer must start with the opening '"'.
    /// The token position is that of the opening quote. A literal that is
    /// still open at the end of input is accepted as if it had been closed.
    Token readStringLiteral()
    {
        auto pos = currentPosition();
        string lit;
        readChar(); // opening quote
        while( !buffer.empty && buffer[0]!='"' )
        {
            char c = readChar();
            if( c == '\\' )
            {
                // Only \\ and \" are escapes; any other backslash is literal.
                if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
                    lit ~= readChar();
                else
                    lit ~= c;
            }
            else if( c == '\n' )
            {
                lit ~= c;
                lineno++;
                column = 1;
            }
            else if( c == '\r' )
            {
                // "\r\n" and a lone "\r" are both stored as "\n"
                if( !buffer.empty && buffer[0]=='\n' )
                    readChar();
                lit ~= '\n';
                lineno++;
                column = 1;
            }
            else
                lit ~= c;
        }
        if( !buffer.empty )
            readChar(); // closing quote
        return new Token(pos, lit, true);
    }

    /// True for characters in 0x21..0x7f that are neither alphanumeric nor '_'.
    bool isSymbol(char c)
    {
        return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
    }

    /// Position of the first unread character.
    immutable(LexPosition) currentPosition()
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
unittest
{
    // Lexer provides empty/front/popFront/save, so it must qualify as a forward range.
    assert( std.range.isForwardRange!Lexer );
}
unittest
{
    auto lex = lexerFromString("this is a \t\r\n pen :-( ");
    Token[] ts = std.array.array(lex);

    // Checks line, column and text of the idx-th token.
    void expectAt(size_t idx, int line, int col, string text)
    {
        assert_eq( ts[idx].pos.lineno, line );
        assert_eq( ts[idx].pos.column, col );
        assert_eq( ts[idx].str, text );
    }

    // Words on the first line, then "pen" after the line break.
    expectAt(0, 1, 1, "this");
    expectAt(1, 1, 6, "is");
    expectAt(2, 1, 9, "a");
    expectAt(3, 2, 2, "pen");
    foreach(idx; 0 .. 4)
        assert( !ts[idx].quoted );

    // consecutive symbols are always separated
    // hence, no "++" or "<<" or ...
    expectAt(4, 2, 6, ":");
    expectAt(5, 2, 7, "-");
    expectAt(6, 2, 8, "(");

    assert_eq( ts.length, 7 );
}
unittest
{
    // A saved copy must advance independently of the lexer it was taken from.
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront();

    auto snapshot = original.save();
    assert_eq( original.front.str, "3a" );

    // Advancing the original leaves the snapshot where it was.
    original.popFront();
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    // The original runs out one step before the snapshot does.
    original.popFront();
    snapshot.popFront();
    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
unittest
{
    //!! be sure to run the unittest on the root of the source directory
    // This test lexes this very source file and checks the module declaration
    // and the first import that follows it.
    auto lexf = lexerFromFile("polemy/lex.d");
    lexf = find!`a.str == "module"`(lexf);

    // "module" keyword
    assert_eq( lexf.front.str, "module" );
    assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
    assert_eq( lexf.front.pos.lineno, 7 );
    assert_eq( lexf.front.pos.column, 1 );

    // module name: polemy . lex ;
    lexf.popFront();
    assert_eq( lexf.front.str, "polemy" );
    assert_eq( lexf.front.pos.lineno, 7 );
    assert_eq( lexf.front.pos.column, 8 );
    lexf.popFront();
    assert_eq( lexf.front.str, "." );
    lexf.popFront();
    assert_eq( lexf.front.str, "lex" );
    lexf.popFront();
    assert_eq( lexf.front.str, ";" );

    // first token of the next line
    lexf.popFront();
    assert_eq( lexf.front.str, "import" );
    assert_eq( lexf.front.pos.lineno, 8 );
    assert_eq( lexf.front.pos.column, 1 );
}
unittest
{
    // Input mixing comments, CRLF line breaks, adjacent string literals,
    // escapes, and a string literal spanning three lines.
    // (The continuation lines of the literal must stay unindented.)
    string source = `my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`;
    auto lex = lexerFromString(source);
    Token[] ts = std.array.array(lex);

    // Token texts, in order (the last token's text is not checked).
    string[] texts = [
        "my", "be", "ignored", ".", "hahaha", "hihihi",
        `hu\"huhu`, "123", "aa", "aaa\nbbb # 123\neee"
    ];
    foreach(idx, text; texts)
        assert_eq( ts[idx].str, text );

    // Line numbers: comments and line breaks inside literals still count lines.
    assert_eq( ts[0].pos.lineno, 1 );
    assert_eq( ts[1].pos.lineno, 3 );
    assert_eq( ts[4].pos.lineno, 4 );
    assert_eq( ts[5].pos.lineno, 4 );
    assert_eq( ts[6].pos.lineno, 4 );
    assert_eq( ts[7].pos.lineno, 5 );
    assert_eq( ts[9].pos.lineno, 5 );
    assert_eq( ts[10].pos.lineno, 8 );

    // Only the string literals are flagged as quoted.
    foreach(idx; [0, 1, 2, 3, 4, 10])
        assert( !ts[idx].quoted );
    foreach(idx; [5, 6, 9])
        assert( ts[idx].quoted );

    assert_eq( ts.length, 11 );
}