Artifact Content
Not logged in

Artifact cc80329bc2ab2034954b236f76d60f8d846f989a


/**
 * Authors: k.inaba
 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
 *
 * Lexer for Polemy programming language.
 */
module polemy.lex;
import polemy._common;
import std.file  : readText;
import std.ctype : isspace, isalnum;

/// Exception thrown by this module; it remembers the source position it refers to.

class LexException : Exception
{
	/// Source position associated with the error
	const LexPosition pos;

	/// Composes the exception message as "msg [pos]" and stores the position.
	private this( const LexPosition pos, string msg )
	{
		super( sprintf!"%s [%s]"(msg, pos) );
		this.pos = pos;
	}
}

/// A location (file name, line, column) inside a source code

class LexPosition
{
	// NOTE: keep the field order; the unit test constructs positions as
	// (filename, lineno, column) through the SimpleConstructor mixin.
	immutable string filename; /// name of the source file
	immutable int    lineno;   /// line number, 1-origin
	immutable int    column;   /// column number, 1-origin

	/// Renders the position as "filename:lineno:column"
	override string toString() const
	{
		return sprintf!"%s:%d:%d"(filename, lineno, column);
	}

	mixin SimpleConstructor;
	mixin SimpleCompare;

	/// Placeholder position: file "<unnamed>", line 0, column 0
	static immutable LexPosition dummy;

	static this()
	{
		dummy = new immutable(LexPosition)("<unnamed>",0,0);
	}
}

unittest
{
	// Two positions on the same line, one column apart
	auto here  = new LexPosition("hello.cpp", 123, 45);
	auto after = new LexPosition("hello.cpp", 123, 46);

	// Fields hold the constructor arguments, and toString joins them with ':'
	assert_eq( here.filename, "hello.cpp" );
	assert_eq( here.lineno, 123 );
	assert_eq( here.column, 45 );
	assert_eq( to!string(here), "hello.cpp:123:45" );

	// Positions are comparable
	assert_lt( here, after );
	assert_ne( here, after );

	// There is no default constructor, and no field can be reassigned
	assert( !__traits(compiles, new LexPosition) );
	assert( !__traits(compiles, here.filename="foo") );
	assert( !__traits(compiles, here.lineno  =789) );
	assert( !__traits(compiles, here.column  =222) );
}

/// Represents a lexer token: where it occurred, its text, and whether it was quoted

class Token
{
	immutable LexPosition pos;    /// Position where the token occurred in the source
	immutable string      str;    /// The token string itself
	immutable bool        quoted; /// Was it a "quoted" token or unquoted?

	// The mixin templates below are not defined in this file
	// (NOTE(review): presumably they come from polemy._common - confirm).
	// The unit test that follows relies on them to construct a Token from
	// (pos, str, quoted) in this order and to compare tokens, so the field
	// order above should not be changed casually.
	mixin SimpleConstructor;
	mixin SimpleCompare;
	mixin SimpleToString;
}

unittest
{
	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
	auto t = new Token(p, "class", false);
	auto u = new Token(p, "class", true);

	// Fields hold the constructor arguments
	assert_eq( t.pos, p );
	assert_eq( t.str, "class" );
	assert( !t.quoted );

	// Tokens compare by value; the quoted flag takes part in the comparison
	assert_eq( t, new Token(p, "class", false) );
	assert_lt( t, new Token(p, "struct", false) );
	assert_ne( t, u );
	assert( u.quoted );

	// There is no default constructor, and no field can be reassigned.
	// Each assignment uses a value of the field's own type, so that it can
	// only fail to compile because of immutability, not a type mismatch.
	assert( !__traits(compiles, new Token) );
	assert( !__traits(compiles, t.pos=p) );
	assert( !__traits(compiles, t.str="struct") );
	assert( !__traits(compiles, t.quoted=true) );
}

/// Named Constructors for Lexer

auto lexerFromFile(T...)( string filename, T rest )
{
	// Load the whole file as text, then lex it under its own file name.
	// Any extra arguments are passed on to lexerFromString unchanged.
	auto source = std.file.readText(filename);
	return lexerFromString( source, filename, rest );
}
	
auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
	// Wrap the character sequence in a position-tracking reader that starts
	// at the given file name / line / column, and build a lexer on top of it.
	alias PositionedReader!CharSeq SourceReader;
	auto source = SourceReader(str, filename, lineno, column);
	return new LexerT!(SourceReader)(source);
}

/// Standard Lexer Type: LexerT reading from a PositionedReader over a string
/// (all you have to know is that this is a forward range of Tokens)

alias LexerT!(PositionedReader!string) Lexer;

/// Lexer Implementation: turns a forward range of dchar (which must also
/// provide currentPosition()) into a forward range of Tokens.

class LexerT(Reader)
	if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
{
	/// Range primitive: true when no token is left
	bool empty() /*@property*/
	{
		return current is null;
	}

	/// Range primitive: the current token (throws if the lexer is exhausted)
	Token front() /*@property*/
	{
		return std.exception.enforce(current, "Lexer has already reached the end");
	}

	/// Range primitive: advances to the next token (throws if the lexer is exhausted)
	void popFront()
	{
		std.exception.enforce(current, "Lexer has already reached the end");
		current = readNext();
	}

	/// Range primitive: a new lexer over a saved copy of the reader, starting at the same token
	typeof(this) save() /*@property*/
	{
		return new typeof(this)(reader.save, current);
	}

private: // implementation

	Reader reader;  // input remaining after `current` and the whitespace that follows it
	Token  current; // token returned by front(); null once readNext() finds no more input

	// The reader never rests on a whitespace character: the constructor and
	// readNext() always skip whitespace before returning.
	invariant()
	{
		assert( reader.empty || !std.ctype.isspace(reader.front) );
	}

	// Skips leading whitespace, then reads the first token unless a current
	// token is supplied (as save() does), in which case it is adopted as is.
	this( Reader reader, Token current = null )
	{
		this.reader = reader;
		readWhile!isSpace();
		this.current = (current is null ? readNext() : current);
	}

	// Character classes used to split the input into tokens
	public static {
		/// Whitespace according to std.ctype.isspace
		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
		/// Character in 0x21..0x7f that is not alphanumeric, '_' or '\''
		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
		/// One of "()[]{};" : each such character forms a token on its own
		bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; }
		/// Symbol other than the single-character ones, '"' and '#': runs of these form one token
		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
		/// Anything that is neither whitespace nor a symbol (e.g. alphanumerics, '_', '\'')
		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
	}

	// Reads the body of a quoted string; the opening '"' must already be consumed.
	// `pos` is only used for the error message.
	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

	// Appends the body of a quoted string to `buf` up to (and consuming) the
	// closing '"', and returns the accumulated text.
	// The escapes \" and \\ yield '"' and '\'; a backslash followed by anything
	// else is kept literally. Throws LexException on EOF before the closing quote.
	// Implemented as a loop so that the stack depth does not grow with the
	// length of the string.
	string readQuoted(const LexPosition pos, ref char[] buf)
	{
		for(;;)
		{
			if( reader.empty )
				throw new LexException(pos, "EOF found while lexing a quoted-string");
			dchar c = reader.front;
			reader.popFront;
			if( c == '"' )
				return assumeUnique(buf);
			if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
			{
				// recognized escape: drop the backslash and keep the escaped character
				c = reader.front;
				reader.popFront;
			}
			buf ~= c;
		}
	}

	// Consumes the longest prefix of the input whose characters all satisfy
	// `fn` and returns it (possibly empty).
	string readWhile(alias fn)()
	{
		char[] buf;
		for(; !reader.empty && fn(reader.front); reader.popFront)
			buf ~= reader.front;
		return assumeUnique(buf);
	}

	// Reads one token starting at the current reader position, or returns
	// null at the end of input. Expects the reader not to be on whitespace.
	Token readNext()
	{
		if( reader.empty )
			return null;
		// on normal exit, skip the whitespace after the token so that the
		// reader rests on the next token (or the end of input)
		scope(success)
			readWhile!isSpace();
		if( reader.front == '#' ) // comment
		{
			// discard everything up to the end of the line, then retry
			reader = find(reader, '\n');
			readWhile!isSpace();
			return readNext();
		}
		else if( reader.front == '"' ) // quoted
		{
			auto pos = reader.currentPosition();
			reader.popFront;
			return new Token(pos, readQuoted(pos), true);
		}
		else if( isSSymbol(reader.front) ) // paren
		{
			auto pos = reader.currentPosition();
			string s; s~=reader.front; reader.popFront;
			return new Token(pos, s, false);
		}
		else if( isMSymbol(reader.front) ) // symbol
		{
			auto pos = reader.currentPosition();
			return new Token(pos, readWhile!isMSymbol(), false);
		}
		else // word: a run of letter characters
		{
			auto pos = reader.currentPosition();
			return new Token(pos, readWhile!isLetter(), false);
		}
	}
}

unittest
{
	// The standard Lexer type must satisfy the forward-range interface
	immutable bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
	assert( lexerIsForwardRange );
}

unittest
{
	// Basic tokenization: words, whitespace (including a "\r\n" line break),
	// a multi-character symbol run, and single-character symbols.
	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
	Token[] ts = std.array.array(lex);

	assert_eq( ts[0].pos.lineno, 1 );
	assert_eq( ts[0].pos.column, 1 );
	assert(   !ts[0].quoted );
	assert_eq( ts[0].str, "this" );

	// a tab advances the column by one
	assert_eq( ts[1].pos.lineno, 1 );
	assert_eq( ts[1].pos.column, 6 );
	assert(   !ts[1].quoted );
	assert_eq( ts[1].str, "is" );

	assert_eq( ts[2].pos.lineno, 1 );
	assert_eq( ts[2].pos.column, 9 );
	assert(   !ts[2].quoted );
	assert_eq( ts[2].str, "a" );

	// "\r\n" counts as a single line break
	assert_eq( ts[3].pos.lineno, 2 );
	assert_eq( ts[3].pos.column, 2 );
	assert(   !ts[3].quoted );
	assert_eq( ts[3].str, "pen" );

	// ":-(" is split into the symbol run ":-" and the paren "("
	assert_eq( ts[4].pos.lineno, 2 );
	assert_eq( ts[4].pos.column, 6 );
	assert_eq( ts[4].str, ":-" );

	assert_eq( ts[5].pos.lineno, 2 );
	assert_eq( ts[5].pos.column, 8 );
	assert_eq( ts[5].str, "(" );
	assert_eq( ts[6].str, "@@" );
	assert_eq( ts[7].str, ";" ); // parens and semicolons are split

	assert_eq( ts.length, 8 );
}

unittest
{
	// Lexes this module's own source file and checks token positions.
	// The expected line numbers depend on the layout of the top of that file
	// (the `module` line and the first `import` line).
	// !! be sure to run the unittest on the root of the source directory
	auto lexf = lexerFromFile("polemy/lex.d");	
	// skip ahead to the first "module" token
	lexf = find!`a.str == "module"`(lexf);
	assert_eq( lexf.front.str, "module" );
	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
	assert_eq( lexf.front.pos.lineno, 7 );
	assert_eq( lexf.front.pos.column, 1 );
	lexf.popFront;
	assert_eq( lexf.front.str, "polemy" );
	assert_eq( lexf.front.pos.lineno, 7 );
	assert_eq( lexf.front.pos.column, 8 );
	// step over the rest of the module declaration to reach the next line
	lexf.popFront;
	lexf.popFront;
	lexf.popFront;
	lexf.popFront;
	assert_eq( lexf.front.str, "import" );
	assert_eq( lexf.front.pos.lineno, 8 );
	assert_eq( lexf.front.pos.column, 1 );
}

unittest
{
	// A quoted string that is never closed must raise LexException.
	// The lexer reads its first token on construction, so creating it is enough.
	assert_throw!LexException( lexerFromString(`"`) );
}

unittest
{
	// Comments ('#' to end of line), quoted strings with escapes, and quoted
	// strings spanning several lines with mixed "\n" / "\r\n" line breaks.
	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
	Token[] ts = std.array.array(lex);
	assert_eq( ts[0].str, "my" );
	assert_eq( ts[0].pos.lineno, 1 );
	assert(   !ts[0].quoted );
	// both comments (rest of line 1 and all of line 2) are skipped
	assert_eq( ts[1].str, "be" );
	assert_eq( ts[1].pos.lineno, 3 );
	assert(   !ts[1].quoted );
	assert_eq( ts[2].str, "ignored" );
	assert(   !ts[2].quoted );
	assert_eq( ts[3].str, "." );
	assert(   !ts[3].quoted );
	// a quote ends the preceding word and starts a quoted token
	assert_eq( ts[4].str, "hahaha" );
	assert_eq( ts[4].pos.lineno, 4 );
	assert(   !ts[4].quoted );
	assert_eq( ts[5].str, "hihihi" );
	assert_eq( ts[5].pos.lineno, 4 );
	assert(    ts[5].quoted );
	// \\ and \" inside quotes become \ and "
	assert_eq( ts[6].str, `hu\"huhu` );
	assert_eq( ts[6].pos.lineno, 4 );
	assert(    ts[6].quoted );
	assert_eq( ts[7].str, "123" );
	assert_eq( ts[7].pos.lineno, 5 );
	assert_eq( ts[8].str, "aa" );
	// inside quotes '#' is not a comment, and "\r\n" is read as "\n"
	assert_eq( ts[9].pos.lineno, 5 );
	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
	assert(    ts[9].quoted );
	// line counting continues across the multi-line quoted string
	assert_eq( ts[10].pos.lineno, 8 );
	assert(   !ts[10].quoted );
	assert_eq( ts.length, 11 );
}

unittest
{
	// save() must yield a lexer that advances independently of the original.
	auto lex2 = lexerFromString(" a12\n3a 5 ");
	assert_eq( lex2.front.str, "a12" );
	lex2.popFront;
	auto lex3 = lex2.save; // both are now at "3a"
	assert_eq( lex2.front.str, "3a" );
	lex2.popFront;
	// advancing lex2 does not move lex3
	assert_eq( lex3.front.str, "3a" );
	assert_eq( lex2.front.str, "5" );
	lex2.popFront;
	lex3.popFront;
	// lex2 is exhausted while lex3 is one token behind
	assert( lex2.empty );
	assert( !lex3.empty );
	assert_eq( lex3.front.str, "5" );
}

unittest
{
	// A symbol directly followed by an empty quoted string gives two tokens
	auto lex = lexerFromString(`=""`);

	assert_eq(lex.front.str, "=");
	lex.popFront;

	assert_eq(lex.front.str, "");
	lex.popFront;

	assert( lex.empty );
}

/// Forward range that reads a character sequence one character at a time,
/// keeping track of the position (file name, line, column) and presenting
/// both "\r\n" and a lone '\r' as a single '\n'.

private
struct PositionedReader(CharSeq)
	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
{
	CharSeq buffer;   // input that has not been consumed yet
	string  filename; // file name reported in positions
	int     lineno;   // line of the character at the front
	int     column;   // column of the character at the front

	/// Range primitive
	bool empty() /*@property*/
	{
		return buffer.empty;
	}

	/// Range primitive: the character at the front, with '\r' reported as '\n'
	dchar front() /*@property*/
	{
		dchar c = buffer.front;
		return (c=='\r' ? '\n' : c);
	}

	/// Range primitive: consumes one character (or one "\r\n" pair) and
	/// updates the line/column counters
	void popFront()
	{
		dchar c = buffer.front;
		buffer.popFront;
		if( c=='\r' )
		{
			// swallow the '\n' of a "\r\n" pair so that it counts as one line break
			if( !buffer.empty && buffer.front=='\n' )
				buffer.popFront;
			c = '\n';
		}
		if( c=='\n' )
		{
			lineno ++;
			column = 1;
		}
		else
			column ++;
	}

	/// Range primitive: an independent copy at the same position
	typeof(this) save() /*@property*/
	{
		// Save the underlying range explicitly: a plain struct copy would
		// share the buffer when CharSeq is a reference-type forward range.
		auto copy = this;
		copy.buffer = buffer.save;
		return copy;
	}

	/// Get the current position
	immutable(LexPosition) currentPosition() const
	{
		return new immutable(LexPosition)(filename, lineno, column);
	}
}

unittest
{
	// The reader over a string must be a forward range of dchar
	alias PositionedReader!string StringReader;

	assert( isForwardRange!(StringReader) );
	assert( is(ElementType!(StringReader) == dchar) );
}