From 14efdca3b34ae8b96dba71cc835aad381927bd4c Mon Sep 17 00:00:00 2001
From: Luca Chiodini
Date: Wed, 5 Nov 2025 09:21:24 +0100
Subject: [PATCH] Improve Python's tokenizer for numeric literals

This improves Python's tokenizer for numeric literals with respect to
several aspects:
- Support underscores between digits and after prefixes (fixes #4745)
- Support octal and binary literals
- Support case-insensitive prefixes for hex/octal/binary literals
- Recognize a possible leading minus sign as a separate token, instead
  of mistakenly treating it as part of the numeric literal

Reference:
https://docs.python.org/3/reference/lexical_analysis.html#numeric-literals

Add tests to cover several of the above cases and their combinations.
---
Reviewer note: the `hexdigits` pattern added to python.ts below was
corrected from `/[[0-9a-fA-F]+.../` to `/[0-9a-fA-F]+.../`. The stray `[`
inside the character class made a literal `[` count as a hex digit. The
post-image blob id on the python.ts `index` line was not regenerated.

 src/basic-languages/python/python.test.ts | 90 +++++++++++++++++++++--
 src/basic-languages/python/python.ts      | 16 +++-
 2 files changed, 98 insertions(+), 8 deletions(-)

diff --git a/src/basic-languages/python/python.test.ts b/src/basic-languages/python/python.test.ts
index 019758a4..eeb1ddc8 100644
--- a/src/basic-languages/python/python.test.ts
+++ b/src/basic-languages/python/python.test.ts
@@ -99,7 +99,7 @@ testTokenization('python', [
 			line: "'''Lots '''0.3e-5",
 			tokens: [
 				{ startIndex: 0, type: 'string.python' },
-				{ startIndex: 11, type: 'number.python' }
+				{ startIndex: 11, type: 'number.float.python' }
 			]
 		}
 	],
@@ -171,10 +171,11 @@ testTokenization('python', [
 		{
 			line: '0xAcBFd',
 			tokens: [{ startIndex: 0, type: 'number.hex.python' }]
-		}
-	],
-
-	[
+		},
+		{
+			line: '0X_1234_ABCD',
+			tokens: [{ startIndex: 0, type: 'number.hex.python' }]
+		},
 		{
 			line: '0x0cH',
 			tokens: [
@@ -184,9 +185,88 @@ testTokenization('python', [
 		}
 	],
 
+	[
+		{
+			line: '0o7501',
+			tokens: [{ startIndex: 0, type: 'number.octal.python' }]
+		},
+		{
+			line: '0O_1_2_3_4_5_6_7',
+			tokens: [{ startIndex: 0, type: 'number.octal.python' }]
+		}
+	],
+
+	[
+		{
+			line: '0b0',
+			tokens: [{ startIndex: 0, type: 'number.binary.python' }]
+		},
+		{
+			line: '0B_1010_0101',
+			tokens: [{ startIndex: 0, type: 'number.binary.python' }]
+		}
+	],
+
+	[
+		{
+			line: '3.14',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '456.7j',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '0.34J',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '.999_999',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '1.',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		}
+	],
+
 	[
 		{
 			line: '456.7e-7j',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '0.1234e+1J',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '.12e-0j',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '0E0',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '1e1_0',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		}
+	],
+
+	[
+		{
+			line: '123456',
+			tokens: [{ startIndex: 0, type: 'number.python' }]
+		},
+		{
+			line: '-1L',
+			tokens: [
+				{ startIndex: 0, type: '' },
+				{ startIndex: 1, type: 'number.python' }
+			]
+		},
+		{
+			line: '1_000_000_000',
 			tokens: [{ startIndex: 0, type: 'number.python' }]
 		}
 	],
diff --git a/src/basic-languages/python/python.ts b/src/basic-languages/python/python.ts
index c48255a9..cb832702 100644
--- a/src/basic-languages/python/python.ts
+++ b/src/basic-languages/python/python.ts
@@ -200,6 +200,12 @@ export const language = {
 		{ open: '(', close: ')', token: 'delimiter.parenthesis' }
 	],
 
+	// we include these common regular expressions
+	digits: /\d+(_+\d+)*/,
+	octaldigits: /[0-7]+(_+[0-7]+)*/,
+	binarydigits: /[0-1]+(_+[0-1]+)*/,
+	hexdigits: /[0-9a-fA-F]+(_+[0-9a-fA-F]+)*/,
+
 	tokenizer: {
 		root: [
 			{ include: '@whitespace' },
@@ -241,10 +247,14 @@ export const language = {
 			[/"/, 'string']
 		],
 
-		// Recognize hex, negatives, decimals, imaginaries, longs, and scientific notation
+		// Recognize hex, octal, binary, floating-point (including scientific notation), decimals, plus variants (imaginaries, Python 2 longs)
 		numbers: [
-			[/-?0x([abcdef]|[ABCDEF]|\d)+[lL]?/, 'number.hex'],
-			[/-?(\d*\.)?\d+([eE][+\-]?\d+)?[jJ]?[lL]?/, 'number']
+			[/0[xX]_?(@hexdigits)[lL]?/, 'number.hex'],
+			[/0[oO]_?(@octaldigits)[lL]?/, 'number.octal'],
+			[/0[bB]_?(@binarydigits)[lL]?/, 'number.binary'],
+			[/(((@digits)\.(@digits)?)|(\.(@digits)))([eE][+-]?(@digits))?[jJ]?/, 'number.float'],
+			[/(@digits)[eE][+-]?(@digits)[jJ]?/, 'number.float'],
+			[/(@digits)[lLjJ]?/, 'number']
 		],
 
 		// Recognize strings, including those broken across lines with \ (but not without)