File indexing completed on 2025-08-09 08:19:08
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """Tokenize C++ source code."""
0019
0020 __author__ = 'nnorwitz@google.com (Neal Norwitz)'
0021
0022
0023 try:
0024
0025 import builtins
0026 except ImportError:
0027
0028 import __builtin__ as builtins
0029
0030
0031 import sys
0032
0033 from cpp import utils
0034
0035
0036 if not hasattr(builtins, 'set'):
0037
0038 from sets import Set as set
0039
0040
0041
0042 _letters = 'abcdefghijklmnopqrstuvwxyz'
0043 VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
0044 HEX_DIGITS = set('0123456789abcdefABCDEF')
0045 INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
0046
0047
0048
0049 _STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
0050
0051
0052
0053 UNKNOWN = 'UNKNOWN'
0054 SYNTAX = 'SYNTAX'
0055 CONSTANT = 'CONSTANT'
0056 NAME = 'NAME'
0057 PREPROCESSOR = 'PREPROCESSOR'
0058
0059
0060
0061 WHENCE_STREAM, WHENCE_QUEUE = range(2)
0062
0063
class Token(object):
    """A single lexical element of C++ source.

    A token may be an identifier, syntax char(s), a constant, or a
    pre-processor directive.

    start is the index of the token's first char in the source;
    end is the index one past the token's last char (the token text is
    source[start:end]).
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        # Tokens normally come from the source stream; parsers may
        # re-queue tokens, flipping this to WHENCE_QUEUE.
        self.whence = WHENCE_STREAM

    def __str__(self):
        if utils.DEBUG:
            # Verbose form includes source positions for debugging.
            return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
        return 'Token(%r)' % self.name

    __repr__ = __str__
0087
0088
0089 def _GetString(source, start, i):
0090 i = source.find('"', i+1)
0091 while source[i-1] == '\\':
0092
0093 backslash_count = 1
0094 j = i - 2
0095 while source[j] == '\\':
0096 backslash_count += 1
0097 j -= 1
0098
0099 if (backslash_count % 2) == 0:
0100 break
0101 i = source.find('"', i+1)
0102 return i + 1
0103
0104
0105 def _GetChar(source, start, i):
0106
0107 i = source.find("'", i+1)
0108 while source[i-1] == '\\':
0109
0110 if (i - 2) > start and source[i-2] == '\\':
0111 break
0112 i = source.find("'", i+1)
0113
0114 if i < 0:
0115 i = start
0116 return i + 1
0117
0118
def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: on an unrecognized character outside a '#if 0' block.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Inside a '#if 0' block unknown tokens are tolerated (see below).
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a name token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they start with a prefix such as L"" or u'x'.
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++11 character literal prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # BUG FIX: string-literal prefixes (L, u8, R, uR, ...)
                # are followed by a double quote; the original tested
                # for "'" here, so prefixed strings like L"wide" were
                # split into a NAME plus a separate string token.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:
                # Comment ends the file with no trailing newline.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # Multi-char operators.
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            # Double the char for ::, ++, --, <<, &&, ||, ** etc.;
            # '>' is excluded so '>>' stays two tokens (template close).
            if new_ch == c and c != '>':
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Single-char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                # A float literal that starts with '.', e.g. ".5f".
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Numeric constant.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Hex literal.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # String constant.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Char constant.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Pre-processor directive.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # Consume the directive through end-of-line, honoring
            # backslash continuations, comments, and quoted strings.
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # Jump to the first interesting symbol (newline, comment
                # start, or quote); 'end' covers the no-match case.
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Skip over quoted strings so an embedded '\n' or '//'
                # inside e.g. #include "dir/file.h" is not misread.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue

                # Stop at end-of-line unless the line ends with '\'.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        # '#if 0' (or '#if (0)') disables the block, so
                        # suppress unknown-token errors until #endif.
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Line continuation in code.
            # Different from the pre-processor '\' handling above.
            i += 1
            continue
        elif ignore_errors:
            # Inside a '#if 0' block; the tokens are irrelevant, so just
            # step past the unrecognized char instead of raising.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
0271
0272
if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for path in argv[1:]:
            contents = utils.ReadFile(path)
            if contents is None:
                # Unreadable file; move on to the next argument.
                continue

            for tok in GetTokens(contents):
                print('%-12s: %s' % (tok.token_type, tok.name))
            # Blank line between files.
            sys.stdout.write('\n')

    main(sys.argv)