1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 __version__ = "2.3"
26
27 import re, sys, types
28
29
30 _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
31
32
33
34
35
36 try:
37 _INSTANCETYPE = (types.InstanceType, types.ObjectType)
38 except AttributeError:
39 _INSTANCETYPE = types.InstanceType
41
42
43
46 self.args = (message,)
47 self.text = s
48
49
57
58
59
60
61
62
63
64
65
66
69 self.lexre = None
70
71
72
73 self.lexretext = None
74 self.lexstatere = {}
75 self.lexstateretext = {}
76 self.lexstate = "INITIAL"
77 self.lexstatestack = []
78 self.lexstateinfo = None
79 self.lexstateignore = {}
80 self.lexstateerrorf = {}
81 self.lexreflags = 0
82 self.lexdata = None
83 self.lexpos = 0
84 self.lexlen = 0
85 self.lexerrorf = None
86 self.lextokens = None
87 self.lexignore = ""
88 self.lexliterals = ""
89 self.lexmodule = None
90 self.lineno = 1
91 self.lexdebug = 0
92 self.lexoptimize = 0
93
def clone(self, object=None):
    """Return a new Lexer carrying this lexer's tables, input and position.

    The copy is created with ``Lexer()`` and receives this lexer's rule
    tables, flags, input buffer, scan position and line number.  The
    per-state tables are shared by reference unless ``object`` is given;
    the state stack is copied so that pushing or popping states on one
    lexer does not affect the other.

    object -- when supplied (and truthy), every rule function stored in
              the per-state regex tables and every per-state error
              handler is looked up again by its ``__name__`` on
              ``object``, and ``object`` becomes the copy's ``lexmodule``.
              Table entries without a function are kept unchanged.

    The copy is switched into the current state with ``begin()`` before it
    is returned.
    """
    c = Lexer()

    # Attributes that are taken over as-is (shared by reference).
    for attr in ("lexstatere", "lexstateinfo", "lexstateretext", "lexstate",
                 "lexstateignore", "lexstateerrorf", "lexreflags", "lexdata",
                 "lexpos", "lexlen", "lextokens", "lexdebug", "lineno",
                 "lexoptimize", "lexliterals", "lexmodule"):
        setattr(c, attr, getattr(self, attr))

    # The state stack is mutated by append()/pop(); give the copy its own
    # list instead of aliasing the original's.
    c.lexstatestack = list(self.lexstatestack)

    if object:
        # Rebuild the regex tables with functions re-bound to `object`.
        newtab = {}
        for key, ritem in self.lexstatere.items():
            newre = []
            for cre, findex in ritem:
                newfindex = []
                for f in findex:
                    if not f or not f[0]:
                        # Empty slot or rule without a function.
                        newfindex.append(f)
                        continue
                    newfindex.append((getattr(object, f[0].__name__), f[1]))
                newre.append((cre, newfindex))
            newtab[key] = newre
        c.lexstatere = newtab

        # Re-bind the error handlers; a state may have no handler (None).
        c.lexstateerrorf = {}
        for key, ef in self.lexstateerrorf.items():
            if ef:
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            else:
                c.lexstateerrorf[key] = None
        c.lexmodule = object

    # begin() derives the remaining per-state attributes for the copy.
    c.begin(c.lexstate)
    return c
140
141
142
143
145 tf = open(tabfile+".py","w")
146 tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
147 tf.write("_lextokens = %s\n" % repr(self.lextokens))
148 tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
149 tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
150 tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))
151
152 tabre = { }
153 for key, lre in self.lexstatere.items():
154 titem = []
155 for i in range(len(lre)):
156 titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1])))
157 tabre[key] = titem
158
159 tf.write("_lexstatere = %s\n" % repr(tabre))
160 tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))
161
162 taberr = { }
163 for key, ef in self.lexstateerrorf.items():
164 if ef:
165 taberr[key] = ef.__name__
166 else:
167 taberr[key] = None
168 tf.write("_lexstateerrorf = %s\n" % repr(taberr))
169 tf.close()
170
171
172
173
175 exec "import %s as lextab" % tabfile
176 self.lextokens = lextab._lextokens
177 self.lexreflags = lextab._lexreflags
178 self.lexliterals = lextab._lexliterals
179 self.lexstateinfo = lextab._lexstateinfo
180 self.lexstateignore = lextab._lexstateignore
181 self.lexstatere = { }
182 self.lexstateretext = { }
183 for key,lre in lextab._lexstatere.items():
184 titem = []
185 txtitem = []
186 for i in range(len(lre)):
187 titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
188 txtitem.append(lre[i][0])
189 self.lexstatere[key] = titem
190 self.lexstateretext[key] = txtitem
191 self.lexstateerrorf = { }
192 for key,ef in lextab._lexstateerrorf.items():
193 self.lexstateerrorf[key] = fdict[ef]
194 self.begin('INITIAL')
195
196
197
198
205
206
207
208
210 if not self.lexstatere.has_key(state):
211 raise ValueError, "Undefined state"
212 self.lexre = self.lexstatere[state]
213 self.lexretext = self.lexstateretext[state]
214 self.lexignore = self.lexstateignore.get(state,"")
215 self.lexerrorf = self.lexstateerrorf.get(state,None)
216 self.lexstate = state
217
218
219
220
222 self.lexstatestack.append(self.lexstate)
223 self.begin(state)
224
225
226
227
229 self.begin(self.lexstatestack.pop())
230
231
232
233
236
237
238
239
242
243
244
245
246
247
248
249
251
252 lexpos = self.lexpos
253 lexlen = self.lexlen
254 lexignore = self.lexignore
255 lexdata = self.lexdata
256
257 while lexpos < lexlen:
258
259 if lexdata[lexpos] in lexignore:
260 lexpos += 1
261 continue
262
263
264 for lexre,lexindexfunc in self.lexre:
265 m = lexre.match(lexdata,lexpos)
266 if not m: continue
267
268
269 self.lexmatch = m
270
271
272 tok = LexToken()
273 tok.value = m.group()
274 tok.lineno = self.lineno
275 tok.lexpos = lexpos
276 tok.lexer = self
277
278 lexpos = m.end()
279 i = m.lastindex
280 func,tok.type = lexindexfunc[i]
281 self.lexpos = lexpos
282
283 if not func:
284
285 if tok.type: return tok
286 break
287
288
289 if not callable(func):
290 break
291
292
293 newtok = func(tok)
294
295
296 if not newtok:
297 lexpos = self.lexpos
298 break
299
300
301 if not self.lexoptimize:
302 if not self.lextokens.has_key(newtok.type):
303 raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
304 func.func_code.co_filename, func.func_code.co_firstlineno,
305 func.__name__, newtok.type),lexdata[lexpos:])
306
307 return newtok
308 else:
309
310 if lexdata[lexpos] in self.lexliterals:
311 tok = LexToken()
312 tok.value = lexdata[lexpos]
313 tok.lineno = self.lineno
314 tok.lexer = self
315 tok.type = tok.value
316 tok.lexpos = lexpos
317 self.lexpos = lexpos + 1
318 return tok
319
320
321 if self.lexerrorf:
322 tok = LexToken()
323 tok.value = self.lexdata[lexpos:]
324 tok.lineno = self.lineno
325 tok.type = "error"
326 tok.lexer = self
327 tok.lexpos = lexpos
328 self.lexpos = lexpos
329 newtok = self.lexerrorf(tok)
330 if lexpos == self.lexpos:
331
332 raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
333 lexpos = self.lexpos
334 if not newtok: continue
335 return newtok
336
337 self.lexpos = lexpos
338 raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])
339
340 self.lexpos = lexpos + 1
341 if self.lexdata is None:
342 raise RuntimeError, "No input string given with input()"
343 return None
344
345
346
347
348
349
350
351
352
354 import os.path
355 base,ext = os.path.splitext(filename)
356 if ext != '.py': return 1
357
358 try:
359 f = open(filename)
360 lines = f.readlines()
361 f.close()
362 except IOError:
363 return 1
364
365 fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
366 sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
367 counthash = { }
368 linen = 1
369 noerror = 1
370 for l in lines:
371 m = fre.match(l)
372 if not m:
373 m = sre.match(l)
374 if m:
375 name = m.group(1)
376 prev = counthash.get(name)
377 if not prev:
378 counthash[name] = linen
379 else:
380 print >>sys.stderr, "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
381 noerror = 0
382 linen += 1
383 return noerror
384
385
386
387
388
389
390
391
393 result = []
394 for f in funclist:
395 if f and f[0]:
396 result.append((f[0].__name__,f[1]))
397 else:
398 result.append(f)
399 return result
400
401
402
403
404
405
406
407
409 result = []
410 for n in namelist:
411 if n and n[0]:
412 result.append((fdict[n[0]],n[1]))
413 else:
414 result.append(n)
415 return result
416
417
418
419
420
421
422
423
424
452
453
454
455
456
457
458
459
460
461
463 nonstate = 1
464 parts = s.split("_")
465 for i in range(1,len(parts)):
466 if not names.has_key(parts[i]) and parts[i] != 'ANY': break
467 if i > 1:
468 states = tuple(parts[1:i])
469 else:
470 states = ('INITIAL',)
471
472 if 'ANY' in states:
473 states = tuple(names.keys())
474
475 tokenname = "_".join(parts[i:])
476 return (states,tokenname)
477
478
479
480
481
482
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0):
    """Build a lexer from ``t_`` rule definitions and install it as the default.

    ``tokens``, ``states``, ``literals`` and every name starting with
    ``t_`` are taken from ``module`` (a module or an instance).  With no
    ``module``/``object`` the globals of the calling frame are used.

    Arguments:
      module   -- module or instance holding the lexer specification.
      object   -- same purpose as ``module``; wins when both are supplied.
      debug    -- when true, print the token list, each accepted rule and
                  the master regular expressions to standard output.
      optimize -- when true, skip most validation and, if ``lextab`` is
                  set, first try to load tables with ``readtab``; when
                  that raises ImportError the lexer is built normally and
                  the tables are written with ``writetab``.
      lextab   -- table module name handed to ``readtab``/``writetab``.
      reflags  -- extra ``re`` flags, combined with ``re.VERBOSE`` when
                  rules are validated and passed to ``_form_master_re``.
      nowarn   -- when true, suppress warning messages.

    Returns the new ``Lexer``.  As a side effect the module globals
    ``lexer``, ``token`` and ``input`` are rebound to the lexer and to its
    ``token``/``input`` methods.

    Raises ValueError when ``module`` is neither a module nor an instance,
    and SyntaxError when the specification is unusable: no ``tokens``, no
    ``t_`` rules, an error rule given as a string, or any problem reported
    on standard error while the rules were collected.
    """
    global lexer, token, input

    def _err(msg):
        # Diagnostics and warnings go to standard error, one per line.
        sys.stderr.write(msg + "\n")

    def _out(msg):
        # Debug traces go to standard output, one per line.
        sys.stdout.write(msg + "\n")

    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    error = 0
    files = { }
    lexobj = Lexer()
    lexobj.lexdebug = debug
    lexobj.lexoptimize = optimize

    if nowarn:
        warn = 0
    else:
        warn = 1

    if object:
        module = object

    if module:
        # Caller supplied the specification explicitly.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, _INSTANCETYPE):
            ldict = { }
            for k in dir(module):
                ldict[k] = getattr(module, k)
        else:
            raise ValueError("Expected a module or instance")
        lexobj.lexmodule = module
    else:
        # No specification given: raise and catch an exception to reach
        # our own frame, then step out to the caller and use its globals.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back
            ldict = f.f_globals

    if optimize and lextab:
        # Try the precomputed tables first; build from scratch if the
        # table module cannot be imported.
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj
        except ImportError:
            pass

    # Fetch the token list, state list and literal characters.
    if (module and isinstance(module,_INSTANCETYPE)):
        tokens = getattr(module,"tokens",None)
        states = getattr(module,"states",None)
        literals = getattr(module,"literals","")
    else:
        tokens = ldict.get("tokens",None)
        states = ldict.get("states",None)
        literals = ldict.get("literals","")

    if not tokens:
        raise SyntaxError("lex: module does not define 'tokens'")
    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
        raise SyntaxError("lex: tokens must be a list or tuple.")

    # Build the dictionary of valid token names.
    lexobj.lextokens = { }
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                _err("lex: Bad token name '%s'" % n)
                error = 1
            if warn and n in lexobj.lextokens:
                _err("lex: Warning. Token '%s' multiply defined." % n)
            lexobj.lextokens[n] = None
    else:
        for n in tokens:
            lexobj.lextokens[n] = None

    if debug:
        _out("lex: tokens = '%s'" % list(lexobj.lextokens.keys()))

    # Every literal must be a single-character string.
    try:
        for c in literals:
            if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1:
                _err("lex: Invalid literal %s. Must be a single character" % repr(c))
                error = 1
                continue
    except TypeError:
        _err("lex: Invalid literals specification. literals must be a sequence of characters.")
        error = 1

    lexobj.lexliterals = literals

    # Collect the declared states: a sequence of (name, type) tuples.
    if states:
        if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)):
            _err("lex: states must be defined as a tuple or list.")
            error = 1
        else:
            for s in states:
                if not isinstance(s,types.TupleType) or len(s) != 2:
                    _err("lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s))
                    error = 1
                    continue
                name, statetype = s
                if not isinstance(name,types.StringType):
                    _err("lex: state name %s must be a string" % repr(name))
                    error = 1
                    continue
                if not (statetype == 'inclusive' or statetype == 'exclusive'):
                    _err("lex: state type for state %s must be 'inclusive' or 'exclusive'" % name)
                    error = 1
                    continue
                if name in stateinfo:
                    _err("lex: state '%s' already defined." % name)
                    error = 1
                    continue
                stateinfo[name] = statetype

    # Every name starting with t_ is treated as a rule.
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ]

    funcsym = { }        # state -> [(rule name, function)]
    strsym = { }         # state -> [(rule name, pattern string)]
    toknames = { }       # rule name -> token name

    for s in stateinfo.keys():
        funcsym[s] = []
        strsym[s] = []

    ignore = { }         # state -> ignore string
    errorf = { }         # state -> error handler

    if len(tsymbols) == 0:
        raise SyntaxError("lex: no rules of the form t_rulename are defined.")

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f,stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f,t))
        elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)):
            for s in states: strsym[s].append((f,t))
        else:
            _err("lex: %s not defined as a function or string" % f)
            error = 1

    # Function rules are ordered by the line on which they were defined.
    for f in funcsym.values():
        f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno))

    # String rules are ordered by decreasing pattern length.
    for s in strsym.values():
        s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    regexs = { }

    # Assemble the list of named-group patterns for each state.
    for state in stateinfo.keys():
        regex_list = []

        # Rules defined by functions come first.
        for fname, f in funcsym[state]:
            line = f.func_code.co_firstlineno
            srcfile = f.func_code.co_filename
            files[srcfile] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.func_code.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    _err("%s:%d: Rule '%s' has too many arguments." % (srcfile,line,f.__name__))
                    error = 1
                    continue

                if nargs < reqargs:
                    _err("%s:%d: Rule '%s' requires an argument." % (srcfile,line,f.__name__))
                    error = 1
                    continue

                if tokname == 'ignore':
                    _err("%s:%d: Rule '%s' must be defined as a string." % (srcfile,line,f.__name__))
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    try:
                        c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                            _err("%s:%d: Regular expression for rule '%s' matches empty string." % (srcfile,line,f.__name__))
                            error = 1
                            continue
                    except re.error:
                        e = sys.exc_info()[1]
                        _err("%s:%d: Invalid regular expression for rule '%s'. %s" % (srcfile,line,f.__name__,e))
                        if '#' in f.__doc__:
                            _err("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (srcfile,line, f.__name__))
                        error = 1
                        continue

                    if debug:
                        _out("lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state))

                # The pattern is taken from the function's docstring.
                regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
            else:
                _err("%s:%d: No regular expression defined for rule '%s'" % (srcfile,line,f.__name__))
                # A rule without a pattern cannot be matched; flag it so the
                # build fails instead of silently dropping the rule.
                error = 1

        # Then the rules defined by plain strings.
        for name,r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                if "\\" in r:
                    _err("lex: Warning. %s contains a literal backslash '\\'" % name)
                ignore[state] = r
                continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError("lex: Rule '%s' must be defined as a function" % name)

                if tokname not in lexobj.lextokens and tokname.find("ignore_") < 0:
                    _err("lex: Rule '%s' defined for an unspecified token %s." % (name,tokname))
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
                    if (c.match("")):
                        _err("lex: Regular expression for rule '%s' matches empty string." % name)
                        error = 1
                        continue
                except re.error:
                    e = sys.exc_info()[1]
                    _err("lex: Invalid regular expression for rule '%s'. %s" % (name,e))
                    if '#' in r:
                        _err("lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name)
                    error = 1
                    continue
                if debug:
                    _out("lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state))

            regex_list.append("(?P<%s>%s)" % (name,r))

        if not regex_list:
            _err("lex: No rules defined for state '%s'" % state)
            error = 1

        regexs[state] = regex_list

    # Check every source file that contributed a function rule.
    if not optimize:
        for f in files.keys():
            if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError("lex: Unable to build lexer.")

    # Compile the master regular expressions for each state.
    for state in regexs.keys():
        lexre, re_text = _form_master_re(regexs[state],reflags,ldict,toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        if debug:
            for i in range(len(re_text)):
                _out("lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i]))

    # Inclusive states additionally get all rules of the INITIAL state.
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Ignore strings per state.
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Error handlers per state.
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL",None)
    if warn and not lexobj.lexerrorf:
        _err("lex: Warning. no t_error rule is defined.")

    # Exclusive states only get warnings for missing error/ignore rules;
    # inclusive states inherit the INITIAL ones.  `errorf` and `ignore` are
    # the same dict objects already stored on the lexer above.
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if warn and s not in errorf:
                _err("lex: Warning. no error rule is defined for exclusive state '%s'" % s)
            if warn and s not in ignore and lexobj.lexignore:
                _err("lex: Warning. no ignore rule is defined for exclusive state '%s'" % s)
        elif stype == 'inclusive':
            if s not in errorf:
                errorf[s] = errorf.get("INITIAL",None)
            if s not in ignore:
                ignore[s] = ignore.get("INITIAL","")

    # Publish the module-level entry points.
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # In optimized mode, save the tables for the next run.
    if lextab and optimize:
        lexobj.writetab(lextab)

    return lexobj
818
819
820
821
822
823
824
def runmain(lexer=None,data=None):
    """Tokenize ``data`` and print one line per token to standard output.

    lexer -- object providing ``input(data)`` and ``token()``.  When it is
             omitted (or falsy) the module-level ``input`` and ``token``
             names are used instead.
    data  -- text to tokenize.  When it is empty or None, the text is read
             from the file named by ``sys.argv[1]`` or, if no argument was
             given, from standard input.

    Each token is written as ``(type,value,lineno,lexpos)``; the loop ends
    at the first false value returned by ``token()``.
    """
    if not data:
        try:
            filename = sys.argv[1]
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()
        else:
            f = open(filename)
            try:
                data = f.read()
            finally:
                # Release the handle even when the read fails.
                f.close()

    # Resolve both entry points before any data is fed, so that a missing
    # module-level lexer is detected up front.
    if lexer:
        _input = lexer.input
        _token = lexer.token
    else:
        _input = input
        _token = token

    _input(data)

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))
850
851
852
853
854
855
856
857
858
860 def set_doc(f):
861 f.__doc__ = r
862 return f
863 return set_doc
864
865
866 Token = TOKEN
867