Differences From Artifact [0972f7a454ea8e4f]:
- File
polemy/lex.d
- 2010-11-08 08:45:51 - part of checkin [8d297342aa] on branch trunk - Replaced Token.Kind with bool quoted (user: kinaba) [annotate]
To Artifact [5f52873e3ff7ae30]:
- File
polemy/lex.d
- 2010-11-08 11:42:14 - part of checkin [5e407d7cf8] on branch trunk - Lexer Refactored so that it can accept multi-symbol tokens (user: kinaba) [annotate]
2 2 * Authors: k.inaba
3 3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 4 *
5 5 * Lexer for Polemy programming language.
6 6 */
7 7 module polemy.lex;
8 8 import polemy._common;
9 +import std.file : readText;
10 +import std.ctype : isspace, isalnum;
9 11
10 -import std.file : readText;
11 -import std.string : munch;
12 -import std.ctype;
12 +/// Exception from this module
13 +
14 +class LexException : Exception
15 +{
16 + this( const LexPosition pos, string msg )
17 + { super(sprintf!"%s [%s]"(msg, pos)); this.pos = pos; }
18 + const LexPosition pos;
19 +};
13 20
14 21 /// Represents a position in a source code
15 22
16 23 class LexPosition
17 24 {
18 25 immutable string filename; /// name of the source file
19 26 immutable int lineno; /// line number, 1, 2, ...
................................................................................
74 81 assert( !__traits(compiles, t.pos=p) );
75 82 assert( !__traits(compiles, t.str=789) );
76 83 assert( !__traits(compiles, t.quoted=true) );
77 84 }
78 85
79 86 /// Named Constructor for Lexer
80 87
81 -Lexer lexerFromFile(T...)( string filename, T rest )
88 +auto lexerFromFile(T...)( string filename, T rest )
82 89 {
83 90 return lexerFromString( std.file.readText(filename), filename, rest );
84 91 }
85 92
86 93 /// Named Constructor for Lexer
87 94
88 -Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
95 +auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
89 96 {
90 - return new Lexer(str, filename, lineno, column);
97 + return new LexerT!(PositionedReader!CharSeq)(
98 + PositionedReader!CharSeq(str, filename, lineno, column)
99 + );
91 100 }
92 101
93 -/// Lexer is a forward range of Tokens
102 +/// Standard Lexer Type (all that users have to know is that this is a forward range of Tokens)
94 103
95 -class Lexer
104 +alias LexerT!(PositionedReader!string) Lexer;
105 +
106 +/// Lexer Implementation
107 +
108 +class LexerT(Reader)
109 + if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
96 110 {
97 111 /// Range primitive
98 112 bool empty() /*@property*/
99 113 {
100 114 return current is null;
101 115 }
102 116
................................................................................
110 124 void popFront() /*@property*/
111 125 {
112 126 std.exception.enforce(current, "Lexer has already reached the end");
113 127 current = readNext();
114 128 }
115 129
116 130 /// Range primitive
117 - Lexer save() /*@property*/
131 + typeof(this) save() /*@property*/
118 132 {
119 - return new Lexer(this.tupleof);
133 + return new typeof(this)(reader.save, current);
120 134 }
121 135
122 136 private: // implementation
123 137
124 - string buffer;
125 - string filename;
126 - int lineno;
127 - int column;
138 + Reader reader;
128 139 Token current;
129 140
130 141 invariant()
131 142 {
132 - assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
143 + assert( reader.empty || !std.ctype.isspace(reader.front) );
144 + }
145 +
146 + this( Reader reader, Token current = null )
147 + {
148 + this.reader = reader;
149 + readWhile!isSpace();
150 + this.current = (current is null ? readNext() : current);
151 + }
152 +
153 + public static {
154 + bool isSpace (dchar c) { return std.ctype.isspace(c)!=0; }
155 + bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
156 + bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; }
157 + bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); }
158 + bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
133 159 }
134 160
135 - this( string buffer, string filename, int lineno, int column, Token current=null )
161 + string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
162 + string readQuoted(const LexPosition pos, ref char[] buf)
136 163 {
137 - this.buffer = buffer;
138 - this.filename = filename;
139 - this.lineno = lineno;
140 - this.column = column;
141 - skipws();
142 - this.current = (current is null ? readNext() : current);
164 + if( reader.empty )
165 + throw new LexException(pos, "EOF found while lexing a quoted-string");
166 + dchar c = reader.front;
167 + reader.popFront;
168 + if( c == '"' )
169 + return assumeUnique(buf);
170 + if( c == '\\' && !reader.empty ) {
171 + if( reader.front=='"' ) {
172 + reader.popFront;
173 + return readQuoted(pos,buf ~= '\"');
174 + }
175 + if( reader.front=='\\' ) {
176 + reader.popFront;
177 + return readQuoted(pos,buf ~= '\\');
178 + }
179 + }
180 + return readQuoted(pos,buf ~= c);
143 181 }
144 182
145 - void skipws()
183 + string readWhile(alias fn)()
146 184 {
147 - bool progress = false;
148 - do
149 - {
150 - string ws = buffer.munch(" \t");
151 - column += ws.length;
152 - progress = !ws.empty;
153 - while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
154 - {
155 - progress = true;
156 - if( buffer[0] == '\n' )
157 - buffer = buffer[1..$];
158 - else // if( buffer.front == '\r' )
159 - {
160 - buffer = buffer[1..$];
161 - if( !buffer.empty && buffer[0]=='\n' )
162 - buffer = buffer[1..$];
163 - }
164 - lineno ++;
165 - column = 1;
166 - }
167 - }while( progress );
185 + char[] buf;
186 + for(; !reader.empty && fn(reader.front); reader.popFront)
187 + buf ~= reader.front;
188 + return assumeUnique(buf);
168 189 }
169 190
170 - char readChar()
171 - {
172 - scope(exit) {
173 - buffer = buffer[1..$];
174 - column ++;
175 - }
176 - return buffer[0];
177 - }
178 -
179 - /// This is the main lexing routine
180 191 Token readNext()
181 192 {
182 - if( buffer.empty )
193 + if( reader.empty )
183 194 return null;
184 - scope(exit)
185 - skipws();
186 -
187 - if( isSymbol(buffer[0]) )
195 + scope(success)
196 + readWhile!isSpace();
197 + if( reader.front == '#' ) // comment
198 + {
199 + reader = find(reader, '\n');
200 + readWhile!isSpace();
201 + return readNext();
202 + }
203 + else if( reader.front == '"' ) // quoted
204 + {
205 + auto pos = reader.currentPosition();
206 + reader.popFront;
207 + return new Token(pos, readQuoted(pos), true);
208 + }
209 + else if( isSSymbol(reader.front) ) // paren
210 + {
211 + auto pos = reader.currentPosition();
212 + string s; s~=reader.front; reader.popFront;
213 + return new Token(pos, s, false);
214 + }
215 + else if( isMSymbol(reader.front) ) // symbol
188 216 {
189 - if( buffer[0] == '#' )
190 - {
191 - // skip comment
192 - while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') )
193 - readChar();
194 - skipws();
195 - return readNext();
196 - }
197 - else if( buffer[0] == '"' )
198 - {
199 - // string literal
200 - auto pos = currentPosition();
201 - string lit;
202 - readChar();
203 - while( !buffer.empty && buffer[0]!='"' )
204 - {
205 - // read one char
206 - char c = readChar();
207 - if( c == '\\' )
208 - {
209 - if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
210 - lit ~= readChar();
211 - else
212 - lit ~= c;
213 - }
214 - else if( c == '\n' )
215 - {
216 - lit ~= c;
217 - lineno++;
218 - column = 1;
219 - }
220 - else if( c == '\r' )
221 - {
222 - if( !buffer.empty && buffer[0]=='\n' )
223 - readChar();
224 - lit ~= '\n';
225 - lineno++;
226 - column = 1;
227 - }
228 - else
229 - lit ~= c;
230 - }
231 - if( !buffer.empty )
232 - readChar();
233 - return new Token(pos, lit, true);
234 - }
235 - else
236 - {
237 - // normal symbol
238 - auto pos = currentPosition();
239 - auto str = ""~readChar();
240 - return new Token(pos, str, false);
241 - }
217 + auto pos = reader.currentPosition();
218 + return new Token(pos, readWhile!isMSymbol(), false);
242 219 }
243 220 else
244 221 {
245 - auto pos = currentPosition();
246 - int i = 0;
247 - while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
248 - ++i;
249 - auto str = buffer[0 .. i];
250 - buffer = buffer[i .. $];
251 - column += i;
252 - return new Token(pos, str, false);
222 + auto pos = reader.currentPosition();
223 + return new Token(pos, readWhile!isLetter(), false);
253 224 }
254 225 }
255 -
256 - bool isSymbol(char c)
257 - {
258 - return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
259 - }
260 -
261 - immutable(LexPosition) currentPosition()
262 - {
263 - return new immutable(LexPosition)(filename, lineno, column);
264 - }
265 226 }
266 227
267 228 unittest
268 229 {
269 230 assert( std.range.isForwardRange!(Lexer) );
270 231 }
271 232
272 233 unittest
273 234 {
274 - auto lex = lexerFromString("this is a \t\r\n pen :-( ");
235 + auto lex = lexerFromString("this is a \t\r\n pen :-( @@; ");
275 236 Token[] ts = std.array.array(lex);
276 237
277 238 assert_eq( ts[0].pos.lineno, 1 );
278 239 assert_eq( ts[0].pos.column, 1 );
279 240 assert( !ts[0].quoted );
280 241 assert_eq( ts[0].str, "this" );
281 242
................................................................................
290 251 assert_eq( ts[2].str, "a" );
291 252
292 253 assert_eq( ts[3].pos.lineno, 2 );
293 254 assert_eq( ts[3].pos.column, 2 );
294 255 assert( !ts[3].quoted );
295 256 assert_eq( ts[3].str, "pen" );
296 257
297 - // consecutive symbols are always separated
298 - // hence, no "++" or "<<" or ...
299 -
300 258 assert_eq( ts[4].pos.lineno, 2 );
301 259 assert_eq( ts[4].pos.column, 6 );
302 - assert_eq( ts[4].str, ":" );
260 + assert_eq( ts[4].str, ":-" );
303 261
304 262 assert_eq( ts[5].pos.lineno, 2 );
305 - assert_eq( ts[5].pos.column, 7 );
306 - assert_eq( ts[5].str, "-" );
263 + assert_eq( ts[5].pos.column, 8 );
264 + assert_eq( ts[5].str, "(" );
265 + assert_eq( ts[6].str, "@@" );
266 + assert_eq( ts[7].str, ";" ); // parens and semicolons are split
307 267
308 - assert_eq( ts[6].pos.lineno, 2 );
309 - assert_eq( ts[6].pos.column, 8 );
310 - assert_eq( ts[6].str, "(" );
311 -
312 - assert_eq( ts.length, 7 );
268 + assert_eq( ts.length, 8 );
313 269 }
314 270
315 271 unittest
316 272 {
317 - auto lex2 = lexerFromString(" a12\n3a 5 ");
318 - assert_eq( lex2.front.str, "a12" );
319 - lex2.popFront;
320 - auto lex3 = lex2.save;
321 - assert_eq( lex2.front.str, "3a" );
322 - lex2.popFront;
323 - assert_eq( lex3.front.str, "3a" );
324 - assert_eq( lex2.front.str, "5" );
325 - lex2.popFront;
326 - lex3.popFront;
327 - assert( lex2.empty );
328 - assert( !lex3.empty );
329 - assert_eq( lex3.front.str, "5" );
330 -}
331 -
332 -unittest
333 -{
334 -//!! be sure to run the unittest on the root of the source directory
273 + // !! be sure to run the unittest from the root of the source directory
335 274 auto lexf = lexerFromFile("polemy/lex.d");
336 275 lexf = find!`a.str == "module"`(lexf);
337 276 assert_eq( lexf.front.str, "module" );
338 277 assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
339 278 assert_eq( lexf.front.pos.lineno, 7 );
340 279 assert_eq( lexf.front.pos.column, 1 );
341 280 lexf.popFront;
342 281 assert_eq( lexf.front.str, "polemy" );
343 282 assert_eq( lexf.front.pos.lineno, 7 );
344 283 assert_eq( lexf.front.pos.column, 8 );
345 284 lexf.popFront;
346 - assert_eq( lexf.front.str, "." );
347 285 lexf.popFront;
348 - assert_eq( lexf.front.str, "lex" );
349 286 lexf.popFront;
350 - assert_eq( lexf.front.str, ";" );
351 287 lexf.popFront;
352 288 assert_eq( lexf.front.str, "import" );
353 289 assert_eq( lexf.front.pos.lineno, 8 );
354 290 assert_eq( lexf.front.pos.column, 1 );
355 291 }
292 +
293 +unittest
294 +{
295 + assert_throw!LexException( lexerFromString(`"`) );
296 +}
356 297
357 298 unittest
358 299 {
359 300 auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
360 301 be ignored.
361 302 hahaha"hihihi""hu\\\"huhu"#123 aa
362 -123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee"
303 +123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
363 304 zzz
364 305 `);
365 306 Token[] ts = std.array.array(lex);
366 307 assert_eq( ts[0].str, "my" );
367 308 assert_eq( ts[0].pos.lineno, 1 );
368 309 assert( !ts[0].quoted );
369 310 assert_eq( ts[1].str, "be" );
................................................................................
388 329 assert_eq( ts[9].pos.lineno, 5 );
389 330 assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
390 331 assert( ts[9].quoted );
391 332 assert_eq( ts[10].pos.lineno, 8 );
392 333 assert( !ts[10].quoted );
393 334 assert_eq( ts.length, 11 );
394 335 }
336 +
337 +unittest
338 +{
339 + auto lex2 = lexerFromString(" a12\n3a 5 ");
340 + assert_eq( lex2.front.str, "a12" );
341 + lex2.popFront;
342 + auto lex3 = lex2.save;
343 + assert_eq( lex2.front.str, "3a" );
344 + lex2.popFront;
345 + assert_eq( lex3.front.str, "3a" );
346 + assert_eq( lex2.front.str, "5" );
347 + lex2.popFront;
348 + lex3.popFront;
349 + assert( lex2.empty );
350 + assert( !lex3.empty );
351 + assert_eq( lex3.front.str, "5" );
352 +}
353 +
354 +/// Forward range for reading character by character,
355 +/// keeping track of position information and taking care of \r\n -> \n conversion.
356 +
357 +private
358 +struct PositionedReader(CharSeq)
359 + if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
360 +{
361 + CharSeq buffer;
362 + string filename;
363 + int lineno;
364 + int column;
365 +
366 + /// Range primitive
367 + bool empty() /*@property*/
368 + {
369 + return buffer.empty;
370 + }
371 +
372 + /// Range primitive
373 + dchar front() /*@property*/
374 + {
375 + dchar c = buffer.front;
376 + return (c=='\r' ? '\n' : c);
377 + }
378 +
379 + /// Range primitive
380 + void popFront() /*@property*/
381 + {
382 + dchar c = buffer.front;
383 + buffer.popFront;
384 + if( c=='\r' )
385 + {
386 + if( !buffer.empty && buffer.front=='\n' )
387 + buffer.popFront;
388 + c = '\n';
389 + }
390 + if( c=='\n' )
391 + {
392 + lineno ++;
393 + column = 1;
394 + }
395 + else
396 + column ++;
397 + }
398 +
399 + /// Range primitive
400 + typeof(this) save() /*@property*/
401 + {
402 + return this;
403 + }
404 +
405 + /// Get the current position
406 + immutable(LexPosition) currentPosition() const
407 + {
408 + return new immutable(LexPosition)(filename, lineno, column);
409 + }
410 +}
411 +
412 +unittest
413 +{
414 + assert( isForwardRange!(PositionedReader!string) );
415 + assert( is(ElementType!(PositionedReader!string) == dchar) );
416 +}