Differences From Artifact [0972f7a454ea8e4f]:
- File
polemy/lex.d
- 2010-11-08 08:45:51 - part of checkin [8d297342aa] on branch trunk - Replaced Token.Kind with bool quoted (user: kinaba) [annotate]
To Artifact [5f52873e3ff7ae30]:
- File
polemy/lex.d
- 2010-11-08 11:42:14 - part of checkin [5e407d7cf8] on branch trunk - Lexer Refactored so that it can accept multi-symbol token (user: kinaba) [annotate]
2 * Authors: k.inaba 2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/ 3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 * 4 *
5 * Lexer for Polemy programming language. 5 * Lexer for Polemy programming language.
6 */ 6 */
7 module polemy.lex; 7 module polemy.lex;
8 import polemy._common; 8 import polemy._common;
> 9 import std.file : readText;
> 10 import std.ctype : isspace, isalnum;
9 11
10 import std.file : readText; | 12 /// Exception from this module
11 import std.string : munch; <
> 13
12 import std.ctype; | 14 class LexException : Exception
> 15 {
> 16 this( const LexPosition pos, string msg )
> 17 { super(sprintf!"%s [%s]"(msg, pos)); this.pos = pos; }
> 18 const LexPosition pos;
> 19 };
13 20
14 /// Represents a position in a source code 21 /// Represents a position in a source code
15 22
16 class LexPosition 23 class LexPosition
17 { 24 {
18 immutable string filename; /// name of the source file 25 immutable string filename; /// name of the source file
19 immutable int lineno; /// line number, 1, 2, ... 26 immutable int lineno; /// line number, 1, 2, ...
................................................................................................................................................................................
74 assert( !__traits(compiles, t.pos=p) ); 81 assert( !__traits(compiles, t.pos=p) );
75 assert( !__traits(compiles, t.str=789) ); 82 assert( !__traits(compiles, t.str=789) );
76 assert( !__traits(compiles, t.quoted=true) ); 83 assert( !__traits(compiles, t.quoted=true) );
77 } 84 }
78 85
79 /// Named Constructor for Lexer 86 /// Named Constructor for Lexer
80 87
81 Lexer lexerFromFile(T...)( string filename, T rest ) | 88 auto lexerFromFile(T...)( string filename, T rest )
82 { 89 {
83 return lexerFromString( std.file.readText(filename), filename, rest ); 90 return lexerFromString( std.file.readText(filename), filename, rest );
84 } 91 }
85 92
86 /// Named Constructor for Lexer 93 /// Named Constructor for Lexer
87 94
88 Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, in | 95 auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lin
89 { 96 {
> 97 return new LexerT!(PositionedReader!CharSeq)(
90 return new Lexer(str, filename, lineno, column); | 98 PositionedReader!CharSeq(str, filename, lineno, column)
> 99 );
91 } 100 }
92 101
93 /// Lexer is a forward range of Tokens | 102 /// Standard Lexer Type (all users have to know is that this is a forward range
94 103
> 104 alias LexerT!(PositionedReader!string) Lexer;
> 105
> 106 /// Lexer Implementation
> 107
95 class Lexer | 108 class LexerT(Reader)
> 109 if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
96 { 110 {
97 /// Range primitive 111 /// Range primitive
98 bool empty() /*@property*/ 112 bool empty() /*@property*/
99 { 113 {
100 return current is null; 114 return current is null;
101 } 115 }
102 116
................................................................................................................................................................................
110 void popFront() /*@property*/ 124 void popFront() /*@property*/
111 { 125 {
112 std.exception.enforce(current, "Lexer has already reached the en 126 std.exception.enforce(current, "Lexer has already reached the en
113 current = readNext(); 127 current = readNext();
114 } 128 }
115 129
116 /// Range primitive 130 /// Range primitive
117 Lexer save() /*@property*/ | 131 typeof(this) save() /*@property*/
118 { 132 {
119 return new Lexer(this.tupleof); | 133 return new typeof(this)(reader.save, current);
120 } 134 }
121 135
122 private: // implementation 136 private: // implementation
123 137
124 string buffer; | 138 Reader reader;
125 string filename; <
126 int lineno; <
127 int column; <
128 Token current; 139 Token current;
129 140
130 invariant() 141 invariant()
131 { 142 {
132 assert( buffer.empty || !std.ctype.isspace(buffer[0]) ); | 143 assert( reader.empty || !std.ctype.isspace(reader.front) );
> 144 }
> 145
> 146 this( Reader reader, Token current = null )
> 147 {
> 148 this.reader = reader;
> 149 readWhile!isSpace();
> 150 this.current = (current is null ? readNext() : current);
> 151 }
> 152
> 153 public static {
> 154 bool isSpace (dchar c) { return std.ctype.isspace(c)!=0; }
> 155 bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.cty
> 156 bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; }
> 157 bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c);
> 158 bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
133 } 159 }
134 160
135 this( string buffer, string filename, int lineno, int column, Token curr | 161 string readQuoted(const LexPosition pos){char[] buf; return readQuoted(p
> 162 string readQuoted(const LexPosition pos, ref char[] buf)
136 { 163 {
137 this.buffer = buffer; | 164 if( reader.empty )
138 this.filename = filename; | 165 throw new LexException(pos, "EOF found while lexing a qu
139 this.lineno = lineno; | 166 dchar c = reader.front;
140 this.column = column; | 167 reader.popFront;
141 skipws(); | 168 if( c == '"' )
142 this.current = (current is null ? readNext() : current); | 169 return assumeUnique(buf);
> 170 if( c == '\\' && !reader.empty ) {
> 171 if( reader.front=='"' ) {
> 172 reader.popFront;
> 173 return readQuoted(pos,buf ~= '\"');
> 174 }
> 175 if( reader.front=='\\' ) {
> 176 reader.popFront;
> 177 return readQuoted(pos,buf ~= '\\');
> 178 }
> 179 }
> 180 return readQuoted(pos,buf ~= c);
143 } 181 }
144 182
145 void skipws() | 183 string readWhile(alias fn)()
146 { 184 {
147 bool progress = false; | 185 char[] buf;
148 do | 186 for(; !reader.empty && fn(reader.front); reader.popFront)
149 { <
> 187 buf ~= reader.front;
150 string ws = buffer.munch(" \t"); | 188 return assumeUnique(buf);
151 column += ws.length; <
152 progress = !ws.empty; <
153 while( !buffer.empty && (buffer[0]=='\r' || buffer[0]==' <
154 { <
155 progress = true; <
156 if( buffer[0] == '\n' ) <
157 buffer = buffer[1..$]; <
158 else // if( buffer.front == '\r' ) <
159 { <
160 buffer = buffer[1..$]; <
161 if( !buffer.empty && buffer[0]=='\n' ) <
162 buffer = buffer[1..$]; <
163 } <
164 lineno ++; <
165 column = 1; <
166 } <
167 }while( progress ); <
168 } 189 }
169 190
170 char readChar() <
171 { <
172 scope(exit) { <
173 buffer = buffer[1..$]; <
174 column ++; <
175 } <
176 return buffer[0]; <
177 } <
178 <
179 /// This is the main lexing routine <
180 Token readNext() 191 Token readNext()
181 { 192 {
182 if( buffer.empty ) | 193 if( reader.empty )
183 return null; 194 return null;
184 scope(exit) | 195 scope(success)
185 skipws(); | 196 readWhile!isSpace();
> 197 if( reader.front == '#' ) // comment
186 | 198 {
187 if( isSymbol(buffer[0]) ) <
> 199 reader = find(reader, '\n');
> 200 readWhile!isSpace();
> 201 return readNext();
> 202 }
> 203 else if( reader.front == '"' ) // quoted
> 204 {
> 205 auto pos = reader.currentPosition();
> 206 reader.popFront;
> 207 return new Token(pos, readQuoted(pos), true);
> 208 }
> 209 else if( isSSymbol(reader.front) ) // paren
> 210 {
> 211 auto pos = reader.currentPosition();
> 212 string s; s~=reader.front; reader.popFront;
> 213 return new Token(pos, s, false);
> 214 }
> 215 else if( isMSymbol(reader.front) ) // symbol
188 { 216 {
189 if( buffer[0] == '#' ) <
190 { <
191 // skip comment <
192 while( !buffer.empty && (buffer[0]!='\n' && buff <
193 readChar(); <
194 skipws(); <
195 return readNext(); <
196 } <
197 else if( buffer[0] == '"' ) <
198 { <
199 // string literal <
200 auto pos = currentPosition(); | 217 auto pos = reader.currentPosition();
201 string lit; <
202 readChar(); <
203 while( !buffer.empty && buffer[0]!='"' ) <
204 { <
205 // read one char <
206 char c = readChar(); <
207 if( c == '\\' ) <
208 { <
209 if( !buffer.empty && (buffer[0]= <
210 lit ~= readChar(); <
211 else <
212 lit ~= c; <
213 } <
214 else if( c == '\n' ) <
215 { <
216 lit ~= c; <
217 lineno++; <
218 column = 1; <
219 } <
220 else if( c == '\r' ) <
221 { <
222 if( !buffer.empty && buffer[0]== <
223 readChar(); <
224 lit ~= '\n'; <
225 lineno++; <
226 column = 1; <
227 } <
228 else <
229 lit ~= c; <
230 } <
231 if( !buffer.empty ) <
232 readChar(); <
233 return new Token(pos, lit, true); | 218 return new Token(pos, readWhile!isMSymbol(), false);
234 } <
235 else <
236 { <
237 // normal symbol <
238 auto pos = currentPosition(); <
239 auto str = ""~readChar(); <
240 return new Token(pos, str, false); <
241 } <
242 } 219 }
243 else 220 else
244 { 221 {
245 auto pos = currentPosition(); | 222 auto pos = reader.currentPosition();
246 int i = 0; <
247 while( i<buffer.length && !std.ctype.isspace(buffer[i]) <
248 ++i; <
249 auto str = buffer[0 .. i]; <
250 buffer = buffer[i .. $]; <
251 column += i; <
252 return new Token(pos, str, false); | 223 return new Token(pos, readWhile!isLetter(), false);
253 } 224 }
254 } 225 }
255 <
256 bool isSymbol(char c) <
257 { <
258 return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_'); <
259 } <
260 <
261 immutable(LexPosition) currentPosition() <
262 { <
263 return new immutable(LexPosition)(filename, lineno, column); <
264 } <
265 } 226 }
266 227
267 unittest 228 unittest
268 { 229 {
269 assert( std.range.isForwardRange!(Lexer) ); 230 assert( std.range.isForwardRange!(Lexer) );
270 } 231 }
271 232
272 unittest 233 unittest
273 { 234 {
274 auto lex = lexerFromString("this is a \t\r\n pen :-( "); | 235 auto lex = lexerFromString("this is a \t\r\n pen :-( @@; ");
275 Token[] ts = std.array.array(lex); 236 Token[] ts = std.array.array(lex);
276 237
277 assert_eq( ts[0].pos.lineno, 1 ); 238 assert_eq( ts[0].pos.lineno, 1 );
278 assert_eq( ts[0].pos.column, 1 ); 239 assert_eq( ts[0].pos.column, 1 );
279 assert( !ts[0].quoted ); 240 assert( !ts[0].quoted );
280 assert_eq( ts[0].str, "this" ); 241 assert_eq( ts[0].str, "this" );
281 242
................................................................................................................................................................................
290 assert_eq( ts[2].str, "a" ); 251 assert_eq( ts[2].str, "a" );
291 252
292 assert_eq( ts[3].pos.lineno, 2 ); 253 assert_eq( ts[3].pos.lineno, 2 );
293 assert_eq( ts[3].pos.column, 2 ); 254 assert_eq( ts[3].pos.column, 2 );
294 assert( !ts[3].quoted ); 255 assert( !ts[3].quoted );
295 assert_eq( ts[3].str, "pen" ); 256 assert_eq( ts[3].str, "pen" );
296 257
297 // consecutive symbols are always separated <
298 // hence, no "++" or "<<" or ... <
299 <
300 assert_eq( ts[4].pos.lineno, 2 ); 258 assert_eq( ts[4].pos.lineno, 2 );
301 assert_eq( ts[4].pos.column, 6 ); 259 assert_eq( ts[4].pos.column, 6 );
302 assert_eq( ts[4].str, ":" ); | 260 assert_eq( ts[4].str, ":-" );
303 261
304 assert_eq( ts[5].pos.lineno, 2 ); 262 assert_eq( ts[5].pos.lineno, 2 );
305 assert_eq( ts[5].pos.column, 7 ); | 263 assert_eq( ts[5].pos.column, 8 );
306 assert_eq( ts[5].str, "-" ); | 264 assert_eq( ts[5].str, "(" );
> 265 assert_eq( ts[6].str, "@@" );
> 266 assert_eq( ts[7].str, ";" ); // parens and semicolons are split
307 267
308 assert_eq( ts[6].pos.lineno, 2 ); <
309 assert_eq( ts[6].pos.column, 8 ); <
310 assert_eq( ts[6].str, "(" ); <
311 <
312 assert_eq( ts.length, 7 ); | 268 assert_eq( ts.length, 8 );
313 } 269 }
314 270
315 unittest 271 unittest
316 { 272 {
317 auto lex2 = lexerFromString(" a12\n3a 5 "); <
318 assert_eq( lex2.front.str, "a12" ); <
319 lex2.popFront; <
320 auto lex3 = lex2.save; <
321 assert_eq( lex2.front.str, "3a" ); <
322 lex2.popFront; <
323 assert_eq( lex3.front.str, "3a" ); <
324 assert_eq( lex2.front.str, "5" ); <
325 lex2.popFront; <
326 lex3.popFront; <
327 assert( lex2.empty ); <
328 assert( !lex3.empty ); <
329 assert_eq( lex3.front.str, "5" ); <
330 } <
331 <
332 unittest <
333 { <
334 //!! be sure to run the unittest on the root of the source directory | 273 // !! be sure to run the unittest on the root of the source directory
335 auto lexf = lexerFromFile("polemy/lex.d"); 274 auto lexf = lexerFromFile("polemy/lex.d");
336 lexf = find!`a.str == "module"`(lexf); 275 lexf = find!`a.str == "module"`(lexf);
337 assert_eq( lexf.front.str, "module" ); 276 assert_eq( lexf.front.str, "module" );
338 assert_eq( lexf.front.pos.filename, "polemy/lex.d" ); 277 assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
339 assert_eq( lexf.front.pos.lineno, 7 ); 278 assert_eq( lexf.front.pos.lineno, 7 );
340 assert_eq( lexf.front.pos.column, 1 ); 279 assert_eq( lexf.front.pos.column, 1 );
341 lexf.popFront; 280 lexf.popFront;
342 assert_eq( lexf.front.str, "polemy" ); 281 assert_eq( lexf.front.str, "polemy" );
343 assert_eq( lexf.front.pos.lineno, 7 ); 282 assert_eq( lexf.front.pos.lineno, 7 );
344 assert_eq( lexf.front.pos.column, 8 ); 283 assert_eq( lexf.front.pos.column, 8 );
345 lexf.popFront; 284 lexf.popFront;
346 assert_eq( lexf.front.str, "." ); <
347 lexf.popFront; 285 lexf.popFront;
348 assert_eq( lexf.front.str, "lex" ); <
349 lexf.popFront; 286 lexf.popFront;
350 assert_eq( lexf.front.str, ";" ); <
351 lexf.popFront; 287 lexf.popFront;
352 assert_eq( lexf.front.str, "import" ); 288 assert_eq( lexf.front.str, "import" );
353 assert_eq( lexf.front.pos.lineno, 8 ); 289 assert_eq( lexf.front.pos.lineno, 8 );
354 assert_eq( lexf.front.pos.column, 1 ); 290 assert_eq( lexf.front.pos.column, 1 );
355 } 291 }
> 292
> 293 unittest
> 294 {
> 295 assert_throw!LexException( lexerFromString(`"`) );
> 296 }
356 297
357 unittest 298 unittest
358 { 299 {
359 auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!! 300 auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
360 be ignored. 301 be ignored.
361 hahaha"hihihi""hu\\\"huhu"#123 aa 302 hahaha"hihihi""hu\\\"huhu"#123 aa
362 123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee" | 303 123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
363 zzz 304 zzz
364 `); 305 `);
365 Token[] ts = std.array.array(lex); 306 Token[] ts = std.array.array(lex);
366 assert_eq( ts[0].str, "my" ); 307 assert_eq( ts[0].str, "my" );
367 assert_eq( ts[0].pos.lineno, 1 ); 308 assert_eq( ts[0].pos.lineno, 1 );
368 assert( !ts[0].quoted ); 309 assert( !ts[0].quoted );
369 assert_eq( ts[1].str, "be" ); 310 assert_eq( ts[1].str, "be" );
................................................................................................................................................................................
388 assert_eq( ts[9].pos.lineno, 5 ); 329 assert_eq( ts[9].pos.lineno, 5 );
389 assert_eq( ts[9].str, "aaa\nbbb # 123\neee" ); 330 assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
390 assert( ts[9].quoted ); 331 assert( ts[9].quoted );
391 assert_eq( ts[10].pos.lineno, 8 ); 332 assert_eq( ts[10].pos.lineno, 8 );
392 assert( !ts[10].quoted ); 333 assert( !ts[10].quoted );
393 assert_eq( ts.length, 11 ); 334 assert_eq( ts.length, 11 );
394 } 335 }
> 336
> 337 unittest
> 338 {
> 339 auto lex2 = lexerFromString(" a12\n3a 5 ");
> 340 assert_eq( lex2.front.str, "a12" );
> 341 lex2.popFront;
> 342 auto lex3 = lex2.save;
> 343 assert_eq( lex2.front.str, "3a" );
> 344 lex2.popFront;
> 345 assert_eq( lex3.front.str, "3a" );
> 346 assert_eq( lex2.front.str, "5" );
> 347 lex2.popFront;
> 348 lex3.popFront;
> 349 assert( lex2.empty );
> 350 assert( !lex3.empty );
> 351 assert_eq( lex3.front.str, "5" );
> 352 }
> 353
> 354 /// Forward range for reading character by character,
> 355 /// keeping track of position information and handling \r\n -> \n conversion.
> 356
> 357 private
> 358 struct PositionedReader(CharSeq)
> 359 if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
> 360 {
> 361 CharSeq buffer;
> 362 string filename;
> 363 int lineno;
> 364 int column;
> 365
> 366 /// Range primitive
> 367 bool empty() /*@property*/
> 368 {
> 369 return buffer.empty;
> 370 }
> 371
> 372 /// Range primitive
> 373 dchar front() /*@property*/
> 374 {
> 375 dchar c = buffer.front;
> 376 return (c=='\r' ? '\n' : c);
> 377 }
> 378
> 379 /// Range primitive
> 380 void popFront() /*@property*/
> 381 {
> 382 dchar c = buffer.front;
> 383 buffer.popFront;
> 384 if( c=='\r' )
> 385 {
> 386 if( !buffer.empty && buffer.front=='\n' )
> 387 buffer.popFront;
> 388 c = '\n';
> 389 }
> 390 if( c=='\n' )
> 391 {
> 392 lineno ++;
> 393 column = 1;
> 394 }
> 395 else
> 396 column ++;
> 397 }
> 398
> 399 /// Range primitive
> 400 typeof(this) save() /*@property*/
> 401 {
> 402 return this;
> 403 }
> 404
> 405 /// Get the current position
> 406 immutable(LexPosition) currentPosition() const
> 407 {
> 408 return new immutable(LexPosition)(filename, lineno, column);
> 409 }
> 410 }
> 411
> 412 unittest
> 413 {
> 414 assert( isForwardRange!(PositionedReader!string) );
> 415 assert( is(ElementType!(PositionedReader!string) == dchar) );
> 416 }