File indexing completed on 2025-08-09 08:19:08
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """Tokenize C++ source code."""
0019
0020 __author__ = 'nnorwitz@google.com (Neal Norwitz)'
0021
0022
0023 try:
0024
0025 import builtins
0026 except ImportError:
0027
0028 import __builtin__ as builtins
0029
0030
0031 import sys
0032
0033 from cpp import utils
0034
0035
0036 if not hasattr(builtins, 'set'):
0037
0038 from sets import Set as set
0039
0040
0041
0042 _letters = 'abcdefghijklmnopqrstuvwxyz'
0043 VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
0044 HEX_DIGITS = set('0123456789abcdefABCDEF')
0045 INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
0046
0047
0048
0049 _STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
0050
0051
0052
0053 UNKNOWN = 'UNKNOWN'
0054 SYNTAX = 'SYNTAX'
0055 CONSTANT = 'CONSTANT'
0056 NAME = 'NAME'
0057 PREPROCESSOR = 'PREPROCESSOR'
0058
0059
0060
0061 WHENCE_STREAM, WHENCE_QUEUE = range(2)
0062
0063
class Token(object):
    """A single lexical element of C++ source.

    A token may be an identifier, syntax char(s), a constant, or a
    pre-processor directive.

    start is the index of the token's first char in the source;
    end is the index one past the token's last char (the token text is
    source[start:end]).
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        # Tokens normally come from the source stream; parsers may
        # re-queue tokens, flipping this to WHENCE_QUEUE.
        self.whence = WHENCE_STREAM

    def __str__(self):
        if utils.DEBUG:
            # Verbose form includes source positions for debugging.
            return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
        return 'Token(%r)' % self.name

    __repr__ = __str__
0087
0088
0089 def _GetString(source, start, i):
0090 i = source.find('"', i+1)
0091 while source[i-1] == '\\':
0092
0093 backslash_count = 1
0094 j = i - 2
0095 while source[j] == '\\':
0096 backslash_count += 1
0097 j -= 1
0098
0099 if (backslash_count % 2) == 0:
0100 break
0101 i = source.find('"', i+1)
0102 return i + 1
0103
0104
0105 def _GetChar(source, start, i):
0106
0107 i = source.find("'", i+1)
0108 while source[i-1] == '\\':
0109
0110 if (i - 2) > start and source[i-2] == '\\':
0111 break
0112 i = source.find("'", i+1)
0113
0114 if i < 0:
0115 i = start
0116 return i + 1
0117
0118
def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: on an unrecognized character outside a '#if 0' block.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Inside a '#if 0' block unknown tokens are tolerated (see below).
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a name token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they start with a prefix such as L"" or u'x'.
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++11 character literal prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # BUG FIX: string-literal prefixes (L, u8, R, uR, ...)
                # are followed by a double quote; the original tested
                # for "'" here, so prefixed strings like L"wide" were
                # split into a NAME plus a separate string token.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:
                # Comment ends the file with no trailing newline.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # Multi-char operators.
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            # Double the char for ::, ++, --, <<, &&, ||, ** etc.;
            # '>' is excluded so '>>' stays two tokens (template close).
            if new_ch == c and c != '>':
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Single-char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                # A float literal that starts with '.', e.g. ".5f".
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Numeric constant.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Hex literal.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # String constant.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Char constant.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Pre-processor directive.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # Consume the directive through end-of-line, honoring
            # backslash continuations, comments, and quoted strings.
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # Jump to the first interesting symbol (newline, comment
                # start, or quote); 'end' covers the no-match case.
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Skip over quoted strings so an embedded '\n' or '//'
                # inside e.g. #include "dir/file.h" is not misread.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue

                # Stop at end-of-line unless the line ends with '\'.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        # '#if 0' (or '#if (0)') disables the block, so
                        # suppress unknown-token errors until #endif.
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Line continuation in code.
            # Different from the pre-processor '\' handling above.
            i += 1
            continue
        elif ignore_errors:
            # Inside a '#if 0' block; the tokens are irrelevant, so just
            # step past the unrecognized char instead of raising.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
0271
0272
if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for path in argv[1:]:
            contents = utils.ReadFile(path)
            if contents is None:
                # Unreadable file; move on to the next argument.
                continue

            for tok in GetTokens(contents):
                print('%-12s: %s' % (tok.token_type, tok.name))
            # Blank line between files.
            sys.stdout.write('\n')

    main(sys.argv)