From 14efdca3b34ae8b96dba71cc835aad381927bd4c Mon Sep 17 00:00:00 2001
From: Luca Chiodini <luca@chiodini.org>
Date: Wed, 5 Nov 2025 09:21:24 +0100
Subject: [PATCH] Improve Python's tokenizer for numeric literals

This improves Python's tokenizer for numeric literals in several respects:
- Support underscores between digits and after prefixes (fixes #4745)
- Support octal and binary literals
- Support case-insensitive prefixes for hex/octal/binary literals
- Recognize a possible leading minus sign as a separate token, instead of mistakenly treating it as part of the numeric literal

Reference: https://docs.python.org/3/reference/lexical_analysis.html#numeric-literals

Add tests to cover several of the above cases and their combinations.
---
 src/basic-languages/python/python.test.ts | 90 +++++++++++++++++++++--
 src/basic-languages/python/python.ts      | 16 +++-
 2 files changed, 98 insertions(+), 8 deletions(-)

diff --git a/src/basic-languages/python/python.test.ts b/src/basic-languages/python/python.test.ts
index 019758a4..eeb1ddc8 100644
--- a/src/basic-languages/python/python.test.ts
+++ b/src/basic-languages/python/python.test.ts
@@ -99,7 +99,7 @@ testTokenization('python', [
 			line: "'''Lots '''0.3e-5",
 			tokens: [
 				{ startIndex: 0, type: 'string.python' },
-				{ startIndex: 11, type: 'number.python' }
+				{ startIndex: 11, type: 'number.float.python' }
 			]
 		}
 	],
@@ -171,10 +171,11 @@ testTokenization('python', [
 		{
 			line: '0xAcBFd',
 			tokens: [{ startIndex: 0, type: 'number.hex.python' }]
-		}
-	],
-
-	[
+		},
+		{
+			line: '0X_1234_ABCD',
+			tokens: [{ startIndex: 0, type: 'number.hex.python' }]
+		},
 		{
 			line: '0x0cH',
 			tokens: [
@@ -184,9 +185,88 @@ testTokenization('python', [
 		}
 	],
 
+	[
+		{
+			line: '0o7501',
+			tokens: [{ startIndex: 0, type: 'number.octal.python' }]
+		},
+		{
+			line: '0O_1_2_3_4_5_6_7',
+			tokens: [{ startIndex: 0, type: 'number.octal.python' }]
+		}
+	],
+
+	[
+		{
+			line: '0b0',
+			tokens: [{ startIndex: 0, type: 'number.binary.python' }]
+		},
+		{
+			line: '0B_1010_0101',
+			tokens: [{ startIndex: 0, type: 'number.binary.python' }]
+		}
+	],
+
+	[
+		{
+			line: '3.14',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '456.7j',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '0.34J',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '.999_999',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '1.',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		}
+	],
+
 	[
 		{
 			line: '456.7e-7j',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '0.1234e+1J',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '.12e-0j',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '0E0',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		},
+		{
+			line: '1e1_0',
+			tokens: [{ startIndex: 0, type: 'number.float.python' }]
+		}
+	],
+
+	[
+		{
+			line: '123456',
+			tokens: [{ startIndex: 0, type: 'number.python' }]
+		},
+		{
+			line: '-1L',
+			tokens: [
+				{ startIndex: 0, type: '' },
+				{ startIndex: 1, type: 'number.python' }
+			]
+		},
+		{
+			line: '1_000_000_000',
 			tokens: [{ startIndex: 0, type: 'number.python' }]
 		}
 	],
diff --git a/src/basic-languages/python/python.ts b/src/basic-languages/python/python.ts
index c48255a9..cb832702 100644
--- a/src/basic-languages/python/python.ts
+++ b/src/basic-languages/python/python.ts
@@ -200,6 +200,12 @@ export const language = <languages.IMonarchLanguage>{
 		{ open: '(', close: ')', token: 'delimiter.parenthesis' }
 	],
 
+	// Digit runs shared by the number rules below; one or more underscores may separate digits within a run
+	digits: /\d+(_+\d+)*/,
+	octaldigits: /[0-7]+(_+[0-7]+)*/,
+	binarydigits: /[0-1]+(_+[0-1]+)*/,
+	hexdigits: /[0-9a-fA-F]+(_+[0-9a-fA-F]+)*/,
+
 	tokenizer: {
 		root: [
 			{ include: '@whitespace' },
@@ -241,10 +247,14 @@ export const language = <languages.IMonarchLanguage>{
 			[/"/, 'string']
 		],
 
-		// Recognize hex, negatives, decimals, imaginaries, longs, and scientific notation
+		// Recognize hex, octal, binary, floating-point (including scientific notation), decimals, plus variants (imaginaries, Python 2 longs)
 		numbers: [
-			[/-?0x([abcdef]|[ABCDEF]|\d)+[lL]?/, 'number.hex'],
-			[/-?(\d*\.)?\d+([eE][+\-]?\d+)?[jJ]?[lL]?/, 'number']
+			[/0[xX]_?(@hexdigits)[lL]?/, 'number.hex'],
+			[/0[oO]_?(@octaldigits)[lL]?/, 'number.octal'],
+			[/0[bB]_?(@binarydigits)[lL]?/, 'number.binary'],
+			[/(((@digits)\.(@digits)?)|(\.(@digits)))([eE][+-]?(@digits))?[jJ]?/, 'number.float'],
+			[/(@digits)[eE][+-]?(@digits)[jJ]?/, 'number.float'],
+			[/(@digits)[lLjJ]?/, 'number']
 		],
 
 		// Recognize strings, including those broken across lines with \ (but not without)