/**
* Authors: k.inaba
* License: NYSL 0.9982 http://www.kmonos.net/nysl/
*
* Lexer for Polemy programming language.
*/
module polemy.lex;
import polemy._common;
import std.file : readText;
import std.ctype : isspace, isalnum;
/**
 * Boilerplate shared by the exception classes of this module.
 *
 * When mixed into a class derived from Exception it adds a `pos` field and
 * a constructor that prefixes the message with the position in brackets,
 * or with "[??]" when no position is given (`pos` is null).
 */
/*mixin*/
template ExceptionWithPosition()
{
    /// Source position the error refers to; may be null.
    const LexPosition pos;

    this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    {
        // Pick the decorated message inline so that there is exactly one super() call.
        super( pos is null ? sprintf!"[??] %s"(msg)
                           : sprintf!"[%s] %s"(pos, msg),
               file, line, next );
        this.pos = pos;
    }
}
/// Thrown when the input ends in the middle of a lexical token
/// (for example inside a quoted string that is never closed).
class UnexpectedEOF : Exception
{
    mixin ExceptionWithPosition!();
}
/// Thrown to report a lexical error; carries the source position it refers to.
class LexException : Exception
{
    mixin ExceptionWithPosition!();
}
/// Represents a position in source code.
class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int    lineno;   /// line number, 1-origin
    immutable int    column;   /// column number, 1-origin

    // Presumably generates the (filename, lineno, column) constructor and the
    // comparison operators used by the unit test -- confirm in polemy._common.
    mixin SimpleClass;

    /// Formats the position as "filename:lineno:column".
    override string toString() const
        { return sprintf!"%s:%d:%d"(filename, lineno, column); }

    /// Placeholder position: file "<unnamed>", line 0, column 0.
    static immutable LexPosition dummy;

    // An immutable static is implicitly shared between threads, so it has to be
    // initialized once per process (shared static this), not once per thread.
    shared static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
}
unittest
{
    // Construction, field access and string conversion.
    auto first = new LexPosition("hello.cpp", 123, 45);
    assert_eq( first.filename, "hello.cpp" );
    assert_eq( first.lineno, 123 );
    assert_eq( first.column, 45 );
    assert_eq( text(first), "hello.cpp:123:45" );

    // There is no default constructor and the fields cannot be reassigned.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );

    // A position one column further right is different and compares greater.
    auto second = new LexPosition("hello.cpp", 123, 46);
    assert_lt( first, second );
    assert_ne( first, second );
}
/// Represents a lexer token: where it was found, its text, and whether it was
/// written as a "quoted" token in the source.
class Token
{
// NOTE(review): the declaration order (pos, str, quoted) matches the argument
// order used by `new Token(pos, str, quoted)` elsewhere in this file; presumably
// SimpleClass derives the constructor from it -- confirm before reordering.
immutable LexPosition pos; /// Position where the token occurred in the source
immutable string str; /// The token string itself
immutable bool quoted; /// Was it a "quoted" token or unquoted?
mixin SimpleClass;
}
unittest
{
    auto where  = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto plain  = new Token(where, "class", false);
    auto quoted = new Token(where, "class", true);

    // Field access.
    assert_eq( plain.pos, where );
    assert_eq( plain.str, "class" );
    assert( !plain.quoted );

    // Equality and ordering: same fields are equal, "class" sorts before "struct",
    // and the quoted flag alone makes two tokens different.
    assert_eq( plain, new Token(where, "class", false) );
    assert_lt( plain, new Token(where, "struct", false) );
    assert_ne( plain, quoted );
    assert( quoted.quoted );

    // There is no default constructor and the fields cannot be reassigned.
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, plain.pos=where) );
    assert( !__traits(compiles, plain.str=789) );
    assert( !__traits(compiles, plain.quoted=true) );
}
/// Named constructor for Lexer: reads the whole file `filename` as text and lexes it.
/// Any extra arguments (`ln_cn`) are forwarded to lexerFromString after the file
/// name, i.e. they give the starting line number and column number.
Lexer lexerFromFile(T...)( string filename, T ln_cn )
{
    auto source = readText(filename);
    return lexerFromString( source, filename, ln_cn );
}
/// Named constructor for Lexer: lexes the character sequence `str`.
/// `filename`, `lineno` and `column` give the position reported for the first
/// character of `str`.
LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Reader;
    auto reader = Reader(str, filename, lineno, column);
    return new LexerT!(Reader)(reader);
}
/// Standard Lexer Type (all you have to know is that this is a forward range of Tokens!)
/// It is the lexer instantiated over a position-tracking reader of a `string`.
alias LexerT!(PositionedReader!string) Lexer;
/**
 * Lexer implementation: a forward range of Tokens built on a forward range
 * of dchar. The reader must also provide `currentPosition()`.
 *
 * Tokens recognized by readNext:
 *   - '#' starts a comment that runs up to the next newline and yields no token;
 *   - '"' starts a quoted token, in which \" and \\ are the only escapes;
 *   - each of the characters ()[]{};@ is a token by itself;
 *   - a run of other symbol characters (see isMSymbol) forms one token;
 *   - a run of characters that are neither space nor symbol forms one token.
 */
class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
{
    /// Range primitive: true once every token has been consumed.
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token; enforce() throws if the lexer is empty.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token; enforce() throws if already empty.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: a new lexer over a saved copy of the reader,
    /// positioned on the same current token.
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation
    Reader reader;   // remaining input, always positioned on a non-space character
    Token  current;  // token returned by front(); null when exhausted

    invariant()
    {
        assert( reader.empty || !isSpace(reader.front) );
    }

    // `current` is passed by save(); when it is null the first token is read here.
    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        readWhile!isSpace();
        this.current = (current is null ? readNext() : current);
    }

    public static
    {
        /// Whitespace as classified by std.ctype.isspace.
        bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
        /// Code points 0x21..0x7f that are not alphanumeric, '_' or '\''.
        bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
        /// Symbols that always form a single-character token.
        bool isSSymbol (dchar c) { return "()[]{};@".canFind(c); }
        /// Symbols that may be combined into a multi-character token.
        bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
        /// Everything that is neither space nor symbol.
        bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    /// Reads the rest of a quoted token; the opening '"' must already be consumed.
    string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

    /// Appends the rest of a quoted token to `buf` and returns the result.
    /// Throws UnexpectedEOF (reported at `pos`) if the closing '"' is missing.
    string readQuoted(const LexPosition pos, ref char[] buf)
    {
        // Iterative on purpose: one stack frame per character would overflow
        // the stack on long string literals.
        for(;;)
        {
            if( reader.empty )
                throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
            dchar c = reader.front;
            reader.popFront;
            if( c == '"' )
                return assumeUnique(buf);
            if( c == '\\' && !reader.empty )
            {
                // Only \" and \\ are escapes; any other backslash is kept literally.
                dchar escaped = reader.front;
                if( escaped=='"' || escaped=='\\' )
                {
                    reader.popFront;
                    buf ~= escaped;
                    continue;
                }
            }
            buf ~= c;
        }
    }

    /// Consumes characters while `fn` holds and returns them as a string.
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront)
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    /// Reads the next token, or returns null at the end of the input.
    Token readNext()
    {
        // Skip any number of consecutive comments with a loop rather than by
        // recursion, so that long runs of comment lines do not deepen the stack.
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            readWhile!isSpace();
        }
        if( reader.empty )
            return null;

        // Re-establish the invariant (reader on a non-space character) once a
        // token has been produced successfully.
        scope(success)
            readWhile!isSpace();

        auto pos = reader.currentPosition();
        if( reader.front == '"' ) // quoted
        {
            reader.popFront;
            return new Token(pos, readQuoted(pos), true);
        }
        else if( isSSymbol(reader.front) ) // paren
        {
            string s; s~=reader.front; reader.popFront;
            return new Token(pos, s, false);
        }
        else if( isMSymbol(reader.front) ) // symbol
        {
            return new Token(pos, readWhile!isMSymbol(), false);
        }
        else
        {
            return new Token(pos, readWhile!isLetter(), false);
        }
    }
}
unittest
{
    // Lexer must be usable as a forward range whose elements are Tokens.
    assert( isForwardRange!(Lexer) );
    assert( is(ElementType!(Lexer) == Token) );
}
unittest
{
    // Words, whitespace (including a "\r\n" line break) and symbols.
    Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( @@; ") );

    // First line: three plain words.
    assert_eq( toks[0].pos.lineno, 1 );
    assert_eq( toks[0].pos.column, 1 );
    assert( !toks[0].quoted );
    assert_eq( toks[0].str, "this" );

    assert_eq( toks[1].pos.lineno, 1 );
    assert_eq( toks[1].pos.column, 6 );
    assert( !toks[1].quoted );
    assert_eq( toks[1].str, "is" );

    assert_eq( toks[2].pos.lineno, 1 );
    assert_eq( toks[2].pos.column, 9 );
    assert( !toks[2].quoted );
    assert_eq( toks[2].str, "a" );

    // Second line: "\r\n" counts as a single line break.
    assert_eq( toks[3].pos.lineno, 2 );
    assert_eq( toks[3].pos.column, 2 );
    assert( !toks[3].quoted );
    assert_eq( toks[3].str, "pen" );

    // ":-" is combined into one token, "(" stands alone.
    assert_eq( toks[4].pos.lineno, 2 );
    assert_eq( toks[4].pos.column, 6 );
    assert_eq( toks[4].str, ":-" );

    assert_eq( toks[5].pos.lineno, 2 );
    assert_eq( toks[5].pos.column, 8 );
    assert_eq( toks[5].str, "(" );

    // Parentheses, semicolons and at-marks are always split into single tokens.
    assert_eq( toks[6].str, "@" );
    assert_eq( toks[7].str, "@" );
    assert_eq( toks[8].str, ";" );

    assert_eq( toks.length, 9 );
}
unittest
{
    // !! be sure to run the unittest on the root of the source directory
    // This test lexes this very file, so it depends on the `module` declaration
    // being on line 7 and the first `import` on line 8.
    auto fileLexer = lexerFromFile("polemy/lex.d");
    fileLexer = find!`a.str == "module"`(fileLexer);

    assert_eq( fileLexer.front.str, "module" );
    assert_eq( fileLexer.front.pos.filename, "polemy/lex.d" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 1 );

    fileLexer.popFront();
    assert_eq( fileLexer.front.str, "polemy" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 8 );

    // Step over "polemy", ".", "lex" and ";" to reach the next line.
    foreach( step; 0 .. 4 )
        fileLexer.popFront();
    assert_eq( fileLexer.front.str, "import" );
    assert_eq( fileLexer.front.pos.lineno, 8 );
    assert_eq( fileLexer.front.pos.column, 1 );
}
unittest
{
// A lone opening quote is an unterminated quoted token: the lexer reads its
// first token on construction, so building the lexer itself must throw.
assert_throw!UnexpectedEOF( lexerFromString(`"`) );
}
unittest
{
// Comments, quoted tokens and line counting over mixed "\r\n" / "\n" line endings.
// The literal below is byte-sensitive: the expected line numbers depend on it.
auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
Token[] ts = std.array.array(lex);
// The '#' comments on lines 1 and 2 yield no tokens.
assert_eq( ts[0].str, "my" );
assert_eq( ts[0].pos.lineno, 1 );
assert( !ts[0].quoted );
assert_eq( ts[1].str, "be" );
assert_eq( ts[1].pos.lineno, 3 );
assert( !ts[1].quoted );
assert_eq( ts[2].str, "ignored" );
assert( !ts[2].quoted );
assert_eq( ts[3].str, "." );
assert( !ts[3].quoted );
// Line 4: a word directly followed by two quoted tokens, then a comment.
assert_eq( ts[4].str, "hahaha" );
assert_eq( ts[4].pos.lineno, 4 );
assert( !ts[4].quoted );
assert_eq( ts[5].str, "hihihi" );
assert_eq( ts[5].pos.lineno, 4 );
assert( ts[5].quoted );
// Inside quotes, \\ becomes a backslash and \" becomes a double quote.
assert_eq( ts[6].str, `hu\"huhu` );
assert_eq( ts[6].pos.lineno, 4 );
assert( ts[6].quoted );
assert_eq( ts[7].str, "123" );
assert_eq( ts[7].pos.lineno, 5 );
assert_eq( ts[8].str, "aa" );
// A quoted token may span lines: '#' inside it is not a comment and
// "\r\n" is stored as "\n".
assert_eq( ts[9].pos.lineno, 5 );
assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
assert( ts[9].quoted );
// Lines consumed inside the quoted token still advance the line counter.
assert_eq( ts[10].pos.lineno, 8 );
assert( !ts[10].quoted );
assert_eq( ts.length, 11 );
}
unittest
{
    // save() must produce a lexer that advances independently of the original.
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront();

    auto snapshot = original.save();
    assert_eq( original.front.str, "3a" );

    // Advancing the original leaves the snapshot where it was.
    original.popFront();
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    original.popFront();
    snapshot.popFront();
    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
unittest
{
    // A symbol directly followed by an empty quoted token gives two tokens.
    auto lexer = lexerFromString(`=""`);
    assert_eq(lexer.front.str, "=");
    lexer.popFront();
    assert_eq(lexer.front.str, "");
    lexer.popFront();
    assert( lexer.empty );

    // '@' is never merged into a preceding symbol run.
    auto atLexer = lexerFromString(`-@`);
    assert_eq( atLexer.front.str, "-" );
}
/// Forward range that reads a character sequence one character at a time,
/// keeping track of the position (file name, line, column) and presenting
/// both "\r\n" and a lone '\r' as a single '\n'.
struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
{
    CharSeq buffer;   /// input that has not been consumed yet
    string  filename; /// file name reported by currentPosition
    int     lineno;   /// line number of the front character
    int     column;   /// column number of the front character

    /// Range primitive: true when no input is left.
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the current character, with '\r' reported as '\n'.
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        return (c=='\r' ? '\n' : c);
    }

    /// Range primitive: consumes one character (a "\r\n" pair counts as one)
    /// and updates the line and column counters.
    void popFront() /*@property*/
    {
        dchar c = buffer.front;
        buffer.popFront;
        if( c=='\r' )
        {
            // Swallow the '\n' of a "\r\n" pair so it is not counted twice.
            if( !buffer.empty && buffer.front=='\n' )
                buffer.popFront;
            c = '\n';
        }
        if( c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy of this reader.
    typeof(this) save() /*@property*/
    {
        // Copying the struct alone is not enough when CharSeq is a reference
        // type: the underlying sequence has to be saved as well, otherwise both
        // readers would consume the same buffer.
        auto copy = this;
        copy.buffer = buffer.save;
        return copy;
    }

    /// Get the current position (that of the front character).
    immutable(LexPosition) currentPosition() const
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
unittest
{
    // The reader over a string must itself be a forward range of dchar.
    alias PositionedReader!string StringReader;
    assert( isForwardRange!(StringReader) );
    assert( is(ElementType!(StringReader) == dchar) );
}