StringInterner: add support for UCN identifiers

Closes #823
Vexu · Jan 12, 2025 · 22c237c · 22c237c
1 parent 21ef6e5
commit 22c237c
Show file tree

Hide file tree

Showing 10 changed files with 212 additions and 47 deletions.
diff --git a/src/aro.zig b/src/aro.zig
@@ -5,7 +5,7 @@ pub const Driver = @import("aro/Driver.zig");
 pub const Parser = @import("aro/Parser.zig");
 pub const Preprocessor = @import("aro/Preprocessor.zig");
 pub const Source = @import("aro/Source.zig");
-pub const StringInterner = @import("aro/StringInterner.zig");
+pub const IdentifierInterner = @import("aro/IdentifierInterner.zig");
 pub const target_util = @import("aro/target.zig");
 pub const Tokenizer = @import("aro/Tokenizer.zig");
 pub const Toolchain = @import("aro/Toolchain.zig");

diff --git a/src/aro/CodeGen.zig b/src/aro/CodeGen.zig
@@ -10,7 +10,7 @@ const Builder = Ir.Builder;
 const Builtins = @import("Builtins.zig");
 const Builtin = Builtins.Builtin;
 const Compilation = @import("Compilation.zig");
-const StringId = @import("StringInterner.zig").StringId;
+const StringId = @import("IdentifierInterner.zig").StringId;
 const Tree = @import("Tree.zig");
 const Node = Tree.Node;
 const QualType = @import("TypeStore.zig").QualType;

diff --git a/src/aro/Compilation.zig b/src/aro/Compilation.zig
@@ -15,7 +15,7 @@ const LangOpts = @import("LangOpts.zig");
 const Pragma = @import("Pragma.zig");
 const record_layout = @import("record_layout.zig");
 const Source = @import("Source.zig");
-const StringInterner = @import("StringInterner.zig");
+const IdentifierInterner = @import("IdentifierInterner.zig");
 const target_util = @import("target.zig");
 const Tokenizer = @import("Tokenizer.zig");
 const Token = Tokenizer.Token;
@@ -108,7 +108,7 @@ pragma_handlers: std.StringArrayHashMapUnmanaged(*Pragma) = .{},
 langopts: LangOpts = .{},
 generated_buf: std.ArrayListUnmanaged(u8) = .{},
 builtins: Builtins = .{},
-string_interner: StringInterner = .{},
+identifier_interner: IdentifierInterner = .{},
 interner: Interner = .{},
 type_store: TypeStore = .{},
 /// If this is not null, the directory containing the specified Source will be searched for includes
@@ -156,15 +156,15 @@ pub fn deinit(comp: *Compilation) void {
     comp.pragma_handlers.deinit(comp.gpa);
     comp.generated_buf.deinit(comp.gpa);
     comp.builtins.deinit(comp.gpa);
-    comp.string_interner.deinit(comp.gpa);
+    comp.identifier_interner.deinit(comp.gpa);
     comp.interner.deinit(comp.gpa);
     comp.environment.deinit(comp.gpa);
     comp.type_store.deinit(comp.gpa);
     comp.* = undefined;
 }
 
-pub fn internString(comp: *Compilation, str: []const u8) !StringInterner.StringId {
-    return comp.string_interner.intern(comp.gpa, str);
+pub fn internString(comp: *Compilation, str: []const u8) !IdentifierInterner.StringId {
+    return comp.identifier_interner.intern(comp.gpa, str);
 }
 
 pub fn getSourceEpoch(self: *const Compilation, max: i64) !?u47 {

diff --git a/src/aro/IdentifierInterner.zig b/src/aro/IdentifierInterner.zig
@@ -0,0 +1,149 @@
+const std = @import("std");
+const mem = std.mem;
+const Compilation = @import("Compilation.zig");
+
+const IdentifierInterner = @This();
+
+/// Hash-map context that treats identifiers as equal when they denote the
+/// same character sequence, delegating to `hashIdentifier` and
+/// `eqlIdentifier`.
+const IdentifierContext = struct {
+    /// Hash of `s` as computed by `hashIdentifier`.
+    pub fn hash(_: @This(), s: []const u8) u32 {
+        return hashIdentifier(s);
+    }
+
+    /// Equality of `a` and `b` as decided by `eqlIdentifier`; the index of
+    /// `b` in the table is not needed.
+    pub fn eql(_: @This(), a: []const u8, b: []const u8, _: usize) bool {
+        return eqlIdentifier(a, b);
+    }
+};
+
+/// Result of decoding a single universal character name.
+const DecodedUniversalChar = struct {
+    /// Numeric value of the hex digits; stored without any range check.
+    codepoint: u32,
+    /// Number of input bytes the escape occupied, including the `\u` / `\U` prefix.
+    consumed: usize,
+};
+
+/// Decodes a C99-style universal character name (`\uXXXX` or `\UXXXXXXXX`)
+/// located at the start of `input`.
+///
+/// `input[0]` is expected to be the backslash and `input[1]` the `u`/`U`
+/// marker. Neither is validated here: any marker other than `U` selects the
+/// 4-digit form.
+///
+/// Returns the decoded value together with the number of bytes consumed
+/// (6 or 10), or null when `input` is too short or contains a non-hex digit.
+/// The value is not range checked, so it may not be a valid codepoint.
+fn decodeUniversalChar(input: []const u8) ?DecodedUniversalChar {
+    // Reject input too short to hold the marker before indexing into it.
+    if (input.len < 2) return null;
+
+    const is_long = input[1] == 'U';
+    const required: usize = if (is_long) 10 else 6;
+    if (input.len < required) return null;
+
+    var codepoint: u32 = 0;
+    for (input[2..required]) |c| {
+        const digit: u32 = switch (c) {
+            '0'...'9' => c - '0',
+            'a'...'f' => 10 + (c - 'a'),
+            'A'...'F' => 10 + (c - 'A'),
+            else => return null,
+        };
+        // At most 8 hex digits are accumulated, so this cannot overflow a u32.
+        codepoint = codepoint * 16 + digit;
+    }
+
+    return .{ .codepoint = codepoint, .consumed = required };
+}
+
+/// Iterates over an identifier one character at a time, yielding the same
+/// codepoint whether it was spelled as UTF-8 or as a UCN escape.
+const CharIterator = struct {
+    str: []const u8,
+    i: usize = 0,
+
+    /// Returns the next codepoint, or null once `str` is exhausted.
+    ///
+    /// A `\u`/`\U` escape that fails to decode yields the backslash itself
+    /// and advances one byte. A byte that does not start a complete, valid
+    /// UTF-8 sequence is yielded as its raw value and also advances one byte.
+    fn next(self: *@This()) ?u32 {
+        if (self.i >= self.str.len) return null;
+        const rest = self.str[self.i..];
+        const first = rest[0];
+
+        if (first == '\\' and rest.len > 1 and (rest[1] == 'u' or rest[1] == 'U')) {
+            const decoded = decodeUniversalChar(rest) orelse {
+                self.i += 1;
+                return '\\';
+            };
+            self.i += decoded.consumed;
+            return decoded.codepoint;
+        }
+
+        const len = std.unicode.utf8ByteSequenceLength(first) catch 1;
+        // A lead byte that promises more bytes than remain would make the
+        // fixed-size slices below go out of bounds; treat it as a raw byte.
+        if (len > rest.len) {
+            self.i += 1;
+            return first;
+        }
+        const cp = switch (len) {
+            1 => first,
+            2 => std.unicode.utf8Decode2(rest[0..2].*),
+            3 => std.unicode.utf8Decode3(rest[0..3].*),
+            4 => std.unicode.utf8Decode4(rest[0..4].*),
+            else => unreachable,
+        } catch {
+            self.i += 1;
+            return first;
+        };
+        self.i += len;
+        return cp;
+    }
+};
+
+/// Reports whether `lhs` and `rhs` name the same identifier.
+///
+/// Byte-identical strings are equal. Otherwise, if neither contains a
+/// backslash they are different. If either does, both are walked with
+/// `CharIterator` and compared codepoint by codepoint.
+fn eqlIdentifier(lhs: []const u8, rhs: []const u8) bool {
+    if (mem.eql(u8, lhs, rhs)) return true;
+
+    const lhs_has_escape = mem.indexOfScalar(u8, lhs, '\\') != null;
+    const rhs_has_escape = mem.indexOfScalar(u8, rhs, '\\') != null;
+    if (!lhs_has_escape and !rhs_has_escape) return false;
+
+    var a: CharIterator = .{ .str = lhs };
+    var b: CharIterator = .{ .str = rhs };
+    while (a.next()) |a_char| {
+        const b_char = b.next() orelse return false;
+        if (a_char != b_char) return false;
+    }
+    // `lhs` is exhausted; the identifiers match only if `rhs` is as well.
+    return b.next() == null;
+}
+
+/// Hashes an identifier so that a UCN escape and the UTF-8 spelling of the
+/// same character contribute the same bytes.
+///
+/// Bytes outside an escape are hashed as-is. A well-formed escape is hashed
+/// as the UTF-8 encoding of its value. If that value cannot be encoded
+/// (surrogate half or out of range), the raw value is hashed instead so that
+/// escapes differing only in hex-digit case still agree with
+/// `eqlIdentifier`. A malformed escape is not an error: its backslash is
+/// hashed as an ordinary byte and scanning continues after it, matching
+/// what `CharIterator` does.
+fn hashIdentifier(input: []const u8) u32 {
+    var hasher = std.hash.XxHash3.init(0);
+    var i: usize = 0;
+    while (i < input.len) {
+        const starts_ucn = input[i] == '\\' and i + 1 < input.len and
+            (input[i + 1] == 'u' or input[i + 1] == 'U');
+        if (starts_ucn) {
+            if (decodeUniversalChar(input[i..])) |decoded| {
+                var space: [4]u8 = undefined;
+                // `std.math.cast` avoids the trap `@intCast` would hit for
+                // values that do not fit in a u21 (e.g. `\UFFFFFFFF`).
+                const encoded_len: ?u3 = if (std.math.cast(u21, decoded.codepoint)) |cp|
+                    std.unicode.utf8Encode(cp, &space) catch null
+                else
+                    null;
+                if (encoded_len) |len| {
+                    hasher.update(space[0..len]);
+                } else {
+                    hasher.update(&std.mem.toBytes(decoded.codepoint));
+                }
+                i += decoded.consumed;
+                continue;
+            }
+        }
+        hasher.update(input[i..][0..1]);
+        i += 1;
+    }
+    return @as(u32, @truncate(hasher.final()));
+}
+
+/// Handle for an interned identifier: an index into the interner's table of
+/// keys, with `.empty` reserved for the empty string.
+pub const StringId = enum(u32) {
+    empty = std.math.maxInt(u32),
+    _,
+
+    /// Returns the text for `id`, reading it from `comp.identifier_interner`.
+    pub fn lookup(id: StringId, comp: *const Compilation) []const u8 {
+        return switch (id) {
+            .empty => "",
+            _ => comp.identifier_interner.table.keys()[@intFromEnum(id)],
+        };
+    }
+
+    /// Returns the text for `id`, reading it from the interner `si`.
+    pub fn lookupExtra(id: StringId, si: IdentifierInterner) []const u8 {
+        return switch (id) {
+            .empty => "",
+            _ => si.table.keys()[@intFromEnum(id)],
+        };
+    }
+};
+
+/// Set of interned identifier spellings, keyed with `IdentifierContext`.
+/// A non-empty `StringId` is an index into this table's keys.
+table: std.ArrayHashMapUnmanaged([]const u8, void, IdentifierContext, true) = .empty,
+
+/// Releases the table's storage through `allocator` and leaves `si` undefined.
+/// Only the table is released; no per-string free is performed here.
+pub fn deinit(si: *IdentifierInterner, allocator: mem.Allocator) void {
+    si.table.deinit(allocator);
+    // Poison the struct so that use after deinit is caught in safe builds.
+    si.* = undefined;
+}
+
+/// Interns `str`, which stays owned by the caller, and returns its id.
+/// The empty string always maps to `.empty` and is never put in the table.
+/// Any other string gets the index of its table entry, which is added if an
+/// equal identifier (per `IdentifierContext`) is not already present.
+pub fn intern(si: *IdentifierInterner, allocator: mem.Allocator, str: []const u8) !StringId {
+    switch (str.len) {
+        0 => return .empty,
+        else => {
+            const slot = try si.table.getOrPut(allocator, str);
+            return @enumFromInt(slot.index);
+        },
+    }
+}
diff --git a/src/aro/Parser.zig b/src/aro/Parser.zig
@@ -15,7 +15,7 @@ const InitList = @import("InitList.zig");
 const Preprocessor = @import("Preprocessor.zig");
 const record_layout = @import("record_layout.zig");
 const Source = @import("Source.zig");
-const StringId = @import("StringInterner.zig").StringId;
+const StringId = @import("IdentifierInterner.zig").StringId;
 const SymbolStack = @import("SymbolStack.zig");
 const Symbol = SymbolStack.Symbol;
 const target_util = @import("target.zig");

diff --git a/src/aro/StringInterner.zig b/src/aro/StringInterner.zig
diff --git a/src/aro/SymbolStack.zig b/src/aro/SymbolStack.zig
@@ -4,7 +4,7 @@ const Allocator = mem.Allocator;
 const assert = std.debug.assert;
 
 const Parser = @import("Parser.zig");
-const StringId = @import("StringInterner.zig").StringId;
+const StringId = @import("IdentifierInterner.zig").StringId;
 const Tree = @import("Tree.zig");
 const Token = Tree.Token;
 const TokenIndex = Tree.TokenIndex;

diff --git a/src/aro/Tokenizer.zig b/src/aro/Tokenizer.zig
@@ -19,7 +19,7 @@ pub const Token = struct {
         eof,
         /// identifier containing solely basic character set characters
         identifier,
-        /// identifier with at least one extended character
+        /// identifier with at least one extended character or UCN escape sequence
         extended_identifier,
 
         // string literals with prefixes
@@ -1074,14 +1074,45 @@ pub fn next(self: *Tokenizer) Token {
         pp_num,
         pp_num_exponent,
         pp_num_digit_separator,
+        ucn_slash,
+        ucn,
     } = .start;
 
     var start = self.index;
     var id: Token.Id = .eof;
+    var ucn_wants: u8 = undefined;
+    var ucn_consumed: u8 = undefined;
 
     while (self.index < self.buf.len) : (self.index += 1) {
         const c = self.buf[self.index];
         switch (state) {
+            .ucn_slash => switch (c) {
+                'u' => {
+                    ucn_wants = 4;
+                    ucn_consumed = 0;
+                    state = .ucn;
+                },
+                'U' => {
+                    ucn_wants = 8;
+                    ucn_consumed = 0;
+                    state = .ucn;
+                },
+                else => {
+                    id = .invalid;
+                    break;
+                },
+            },
+            .ucn => switch (c) {
+                'a'...'f', 'A'...'F', '0'...'9' => {
+                    ucn_consumed += 1;
+                    if (ucn_consumed == ucn_wants) {
+                        state = .extended_identifier;
+                    }
+                },
+                else => {
+                    @panic("todo");
+                },
+            },
             .start => switch (c) {
                 '\n' => {
                     id = .nl;
@@ -1100,6 +1131,7 @@ pub fn next(self: *Tokenizer) Token {
                 'u' => state = .u,
                 'U' => state = .U,
                 'L' => state = .L,
+                '\\' => state = .ucn_slash,
                 'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                 '=' => state = .equal,
                 '!' => state = .bang,
@@ -1325,6 +1357,7 @@ pub fn next(self: *Tokenizer) Token {
                     break;
                 },
                 0x80...0xFF => state = .extended_identifier,
+                '\\' => state = .ucn_slash,
                 else => {
                     id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
                     break;
@@ -1732,6 +1765,7 @@ pub fn next(self: *Tokenizer) Token {
         }
     } else if (self.index == self.buf.len) {
         switch (state) {
+            .ucn_slash, .ucn => @panic("todo"),
             .start, .line_comment => {},
             .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
             .extended_identifier => id = .extended_identifier,

diff --git a/src/aro/TypeStore.zig b/src/aro/TypeStore.zig
@@ -5,8 +5,8 @@ const Compilation = @import("Compilation.zig");
 const LangOpts = @import("LangOpts.zig");
 const record_layout = @import("record_layout.zig");
 const Parser = @import("Parser.zig");
-const StringInterner = @import("StringInterner.zig");
-const StringId = StringInterner.StringId;
+const IdentifierInterner = @import("IdentifierInterner.zig");
+const StringId = IdentifierInterner.StringId;
 const target_util = @import("target.zig");
 const Tree = @import("Tree.zig");
 const Node = Tree.Node;

diff --git a/test/cases/ucn identifiers.c b/test/cases/ucn identifiers.c
@@ -0,0 +1,17 @@
+int foo(void) {
+    int \u4F60\u597D = 5;
+    int \u0061 = 5; // TODO: error: character 'a' cannot be specified by a universal character name
+    return 你好;
+}
+
+struct S {
+    int 你好;
+};
+
+int bar(int x) {
+    struct S s;
+    s.\u4F60\u597D = x;
+    return s.你好;
+}
+
+#define TESTS_SKIPPED 1