0001"""
0002reader for pylogo
0003 Ian Bicking <ianb@colorstudy.com>
0004
0005Tokenizer/lexer. Examples:
0006
0007 >>> tokenize('1 2 3')
0008 [1, 2, 3, '\\n']
0009 >>> tokenize('fd 100')
0010 ['fd', 100, '\\n']
0011 >>> tokenize('pr \"hello\\nfd 100\\n')
0012 ['pr', '\"', 'hello', '\\n', 'fd', 100, '\\n']
0013 >>> tokenize('while [:a>2] [make :a :a+1]')
0014 ['while', '[', ':', 'a', '>', 2, ']', '[', 'make', ':', 'a', ':', 'a', '+', 1, ']', '\\n']
0015 >>> tokenize('>>>= <= <><> == =>=<')
0016 ['>', '>', '>=', '<=', '<>', '<>', '=', '=', '=>', '=<', '\\n']
0017 >>> tokenize('apple? !apple .apple apple._me apple10 10apple')
0018 ['apple?', '!apple', '.apple', 'apple._me', 'apple10', 10, 'apple', '\\n']
0019
0020Note that every file fed in is expected to end with a '\\n' (even if
0021the file doesn't actually). We get common.EOF from the tokenizer when
0022it is done.
0023"""
0024
0025
0026from __future__ import generators
0027
0028import re
0029import sys
0030from pylogo.common import *
0031
0032import readline
0033
# A Logo "word" starts with a letter or one of . _ ? ! and may continue
# with those characters plus digits (see the module doctest examples).
word_matcher = r'[a-zA-Z\._\?!][a-zA-Z0-9\._\?!]*'
word_re = re.compile(word_matcher)
# Anchored version: matches only if the *entire* string is one word
# (used by is_word below).
only_word_re = re.compile(r'^%s$' % word_matcher)
# Numeric literal: digits optionally mixed with dots, or a '-' followed
# by digits.  Validity is checked later via int()/float() conversion.
number_re = re.compile(r'(?:[0-9][.0-9]*|-[0-9][0-9]*)')
# Single-character punctuation tokens.
symbols = '()[]+-/*":=><;'
# Two-character operators; tried before falling back to single symbols.
extended_symbols = ['>=', '=>', '<=', '=<', '<>']
# Runs of whitespace to skip between tokens.
white_re = re.compile(r'[ \t\n\r]+')
0041
class FileTokenizer:

    """
    An iterator over the tokens of a file.  Will prompt interactively
    if `prompt` is given.
    """

    def __init__(self, f, output=None, prompt=None):
        """
        `f` is the stream to tokenize (wrapped in a TrackingStream if it
        is a plain file), `output` the stream prompts are written to,
        and `prompt` either a string or a context->string mapping.
        """
        # isinstance (rather than an exact type check) also accepts
        # file subclasses.
        if isinstance(f, file):
            f = TrackingStream(f)
        self.file = f
        self.generator = self._generator()
        self.peeked = []          # one-token lookahead buffer for peek()
        self.prompt = prompt
        self.output = output
        self.context = []         # stack used to choose the prompt text

    def __repr__(self):
        try:
            return '<FileTokenizer %x parsing %s:%i:%i>' % (id(self), self.file.name, self.file.row,
                                                            self.file.col)
        except Exception:
            # The stream may not provide name/row/col; fall back to repr.
            return '<FileTokenizer %x parsing %r>' % (id(self), self.file)

    def print_prompt(self):
        """Write the interactive prompt, if one is configured."""
        if not self.prompt or not self.output:
            return
        if isinstance(self.prompt, str):
            prompt = self.prompt
        else:
            # A mapping prompt is keyed by the innermost parse context.
            if self.context:
                context = self.context[-1]
            else:
                context = None
            prompt = self.prompt.get(context, '?')
        if prompt:
            self.output.write(prompt)
            self.output.flush()

    def push_context(self, context):
        self.context.append(context)

    def pop_context(self):
        self.context.pop()

    def next(self):
        """Return the next token; exits the process if the stream ends
        unexpectedly (the generator itself yields EOF before stopping)."""
        try:
            return self.generator.next()
        except StopIteration:
            import traceback
            traceback.print_exc()
            sys.exit()

    def peek(self):
        """Return the next token without consuming it."""
        if self.peeked:
            return self.peeked[0]
        p = self.next()
        self.peeked = [p]
        return p

    def _generator(self):
        """
        Generator - gets one token from the TrackingStream
        """
        while 1:
            # Drain anything peek() buffered before reading a new line.
            if self.peeked:
                v = self.peeked[0]
                del self.peeked[0]
                yield v
            self.print_prompt()
            l = self.file.readline()
            while 1:
                if self.peeked:
                    v = self.peeked[0]
                    del self.peeked[0]
                    yield v
                # Skip whitespace before the next token.
                m = white_re.match(l, pos=self.file.col)
                if m:
                    self.file.col = m.end()
                if l == '':
                    # readline() returns '' only at end of input.
                    yield EOF
                    break
                if len(l) <= self.file.col:
                    # Line exhausted: emit an explicit end-of-line token.
                    yield '\n'
                    break
                c = l[self.file.col]
                try:
                    cnext = l[self.file.col+1]
                except IndexError:
                    cnext = None
                # A digit, or '-' immediately followed by a digit,
                # begins a numeric literal.
                if (number_re.match(c) or
                    c == '-' and
                    cnext and number_re.match(cnext)):
                    m = number_re.match(l, pos=self.file.col)
                    assert m
                    self.file.col = m.end()
                    n = m.group(0)
                    try:
                        yield int(n)
                    except ValueError:
                        try:
                            yield float(n)
                        except ValueError:
                            # e.g. '1.2.3' matches the regex but is not
                            # a valid number.
                            raise LogoSyntaxError(self.file, 'Not a number: %s' % repr(n))
                    continue
                if c in symbols:
                    # Prefer two-character operators ('>=', '<>', ...)
                    # over the single symbol.
                    if cnext and c + cnext in extended_symbols:
                        self.file.col += 2
                        yield c + cnext
                    else:
                        self.file.col += 1
                        yield c
                elif word_re.match(c):
                    m = word_re.match(l, pos=self.file.col)
                    assert m
                    self.file.col = m.end()
                    yield m.group(0)
                else:
                    # Unrecognized character: emit it as its own token.
                    self.file.col += 1
                    yield c
0165
0166
0167
def is_word(tok):
    """Return True if `tok` is a string consisting of a single Logo word."""
    return isinstance(tok, str) and only_word_re.search(tok) is not None
0173
class ListTokenizer:

    """
    This is just a cache of previously tokenized expressions.  So that
    [a block] can be treated like a stream of tokens.  The tokens are
    taken from `l`.
    """

    def __init__(self, l):
        self.list = l
        # Propagate the source stream (used in tracebacks) when the
        # list-like object carries one.
        try:
            self.file = l.file
        except AttributeError:
            self.file = None
        self.pos = 0              # index of the next token to return
        self.peeked = []          # one-token lookahead buffer for peek()

    def __repr__(self):
        try:
            return '<ListTokenizer %x tokenizing list len=%i, pos=%i>' % (id(self), len(self.list), self.pos)
        except Exception:
            # len() can fail on exotic list-likes; use a minimal repr.
            return '<ListTokenizer %x>' % (id(self))

    def push_context(self, context):
        # Contexts only affect interactive prompts; nothing to do here.
        pass

    def pop_context(self):
        pass

    def peek(self):
        """Return the next token without consuming it."""
        if self.peeked:
            return self.peeked[0]
        p = self.next()
        self.peeked = [p]
        return p

    def next(self):
        """Return the next token, or EOF once the list is exhausted."""
        if self.peeked:
            v = self.peeked[0]
            del self.peeked[0]
            return v
        if self.pos >= len(self.list):
            return EOF
        self.pos += 1
        return self.list[self.pos-1]
0220
class TrackingStream:

    """
    A file-like object that also keeps track of rows and columns,
    for tracebacks.
    """

    def __init__(self, file, name=None):
        self.file = file
        self.col = 0                 # column of the next unread character
        self.row = 0                 # 1-based once the first line is read
        self.savedLines = []         # recently read lines, newest first
        self.maxSavedLines = 10      # cache bound for row_line lookups
        if name is None:
            self.name = self.file.name
        else:
            self.name = name

    def readline(self):
        """Read one line, updating row/col and the saved-line cache."""
        self.row += 1
        self.col = 0
        if self.file is sys.stdin:
            # Interactive input: raw_input gives readline-style editing.
            try:
                l = raw_input() + '\n'
            except EOFError:
                l = ''
        else:
            l = self.file.readline()
        self.savedLines.insert(0, l)
        if len(self.savedLines) > self.maxSavedLines:
            del self.savedLines[self.maxSavedLines:]
        return l

    def row_line(self, row):
        """Return the text of line `row`, or None if it is not cached.

        savedLines[0] is the current row, so `row` maps to index
        self.row - row.  Rows at or beyond len(savedLines) back have
        been evicted (the original `<` test let the boundary row fall
        through and raise IndexError); future rows are also None rather
        than wrapping around via a negative index.
        """
        if row <= self.row - len(self.savedLines) or row > self.row:
            return None
        return self.savedLines[self.row-row]

    def __repr__(self):
        s = repr(self.file)[:-1]
        return '%s %s:%s>' % (s, self.row, self.col)
0262
def tokenize(s):
    """Tokenize the string `s`, returning the list of tokens up to EOF."""
    from StringIO import StringIO
    stream = StringIO(s)
    stream.name = '<string>'
    tokenizer = FileTokenizer(TrackingStream(stream))
    tokens = []
    tok = tokenizer.next()
    while tok is not EOF:
        tokens.append(tok)
        tok = tokenizer.next()
    return tokens
0275
0276def main():
0277 import sys
0278 tok = FileTokenizer(TrackingStream(sys.stdin))
0279 while 1:
0280 print '>> %s' % repr(tok.next())
0281
0282if __name__ == '__main__':
0283 main()