Skip to content

Commit

Permalink
StringInterner: add support for UCN identifiers
Browse files Browse the repository at this point in the history
Closes #823
  • Loading branch information
ehaas committed Jan 12, 2025
1 parent 21ef6e5 commit 22c237c
Show file tree
Hide file tree
Showing 10 changed files with 212 additions and 47 deletions.
2 changes: 1 addition & 1 deletion src/aro.zig
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pub const Driver = @import("aro/Driver.zig");
pub const Parser = @import("aro/Parser.zig");
pub const Preprocessor = @import("aro/Preprocessor.zig");
pub const Source = @import("aro/Source.zig");
pub const StringInterner = @import("aro/StringInterner.zig");
pub const IdentifierInterner = @import("aro/IdentifierInterner.zig");
pub const target_util = @import("aro/target.zig");
pub const Tokenizer = @import("aro/Tokenizer.zig");
pub const Toolchain = @import("aro/Toolchain.zig");
Expand Down
2 changes: 1 addition & 1 deletion src/aro/CodeGen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ const Builder = Ir.Builder;
const Builtins = @import("Builtins.zig");
const Builtin = Builtins.Builtin;
const Compilation = @import("Compilation.zig");
const StringId = @import("StringInterner.zig").StringId;
const StringId = @import("IdentifierInterner.zig").StringId;
const Tree = @import("Tree.zig");
const Node = Tree.Node;
const QualType = @import("TypeStore.zig").QualType;
Expand Down
10 changes: 5 additions & 5 deletions src/aro/Compilation.zig
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const LangOpts = @import("LangOpts.zig");
const Pragma = @import("Pragma.zig");
const record_layout = @import("record_layout.zig");
const Source = @import("Source.zig");
const StringInterner = @import("StringInterner.zig");
const IdentifierInterner = @import("IdentifierInterner.zig");
const target_util = @import("target.zig");
const Tokenizer = @import("Tokenizer.zig");
const Token = Tokenizer.Token;
Expand Down Expand Up @@ -108,7 +108,7 @@ pragma_handlers: std.StringArrayHashMapUnmanaged(*Pragma) = .{},
langopts: LangOpts = .{},
generated_buf: std.ArrayListUnmanaged(u8) = .{},
builtins: Builtins = .{},
string_interner: StringInterner = .{},
identifier_interner: IdentifierInterner = .{},
interner: Interner = .{},
type_store: TypeStore = .{},
/// If this is not null, the directory containing the specified Source will be searched for includes
Expand Down Expand Up @@ -156,15 +156,15 @@ pub fn deinit(comp: *Compilation) void {
comp.pragma_handlers.deinit(comp.gpa);
comp.generated_buf.deinit(comp.gpa);
comp.builtins.deinit(comp.gpa);
comp.string_interner.deinit(comp.gpa);
comp.identifier_interner.deinit(comp.gpa);
comp.interner.deinit(comp.gpa);
comp.environment.deinit(comp.gpa);
comp.type_store.deinit(comp.gpa);
comp.* = undefined;
}

pub fn internString(comp: *Compilation, str: []const u8) !StringInterner.StringId {
return comp.string_interner.intern(comp.gpa, str);
pub fn internString(comp: *Compilation, str: []const u8) !IdentifierInterner.StringId {
return comp.identifier_interner.intern(comp.gpa, str);
}

pub fn getSourceEpoch(self: *const Compilation, max: i64) !?u47 {
Expand Down
149 changes: 149 additions & 0 deletions src/aro/IdentifierInterner.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
const std = @import("std");
const mem = std.mem;
const Compilation = @import("Compilation.zig");

const IdentifierInterner = @This();

/// Hash-map context that makes identifiers compare and hash by their decoded
/// characters, so a universal character name and the character it spells
/// land on the same table entry.
const IdentifierContext = struct {
    /// Hashes `s` with the UCN-aware identifier hash.
    pub fn hash(_: @This(), s: []const u8) u32 {
        return hashIdentifier(s);
    }

    /// Compares two identifiers with the UCN-aware equality; the index of
    /// the stored key is not needed for the comparison.
    pub fn eql(_: @This(), a: []const u8, b: []const u8, _: usize) bool {
        return eqlIdentifier(a, b);
    }
};

/// Result of decoding one universal character name.
const DecodedUniversalChar = struct {
    /// Numeric value spelled by the hex digits. It is not validated: it may
    /// be a surrogate or lie above U+10FFFF.
    codepoint: u32,
    /// Number of input bytes the escape occupied (6 for `\u`, 10 for `\U`).
    consumed: usize,
};

/// Decodes a C99-style universal character name (`\uXXXX` or `\UXXXXXXXX`)
/// found at the start of `input`.
///
/// The first byte is assumed to be the backslash and is not inspected; the
/// second byte selects the long form when it is `U`, and the short form
/// otherwise.
///
/// Returns the decoded value together with the number of bytes consumed, or
/// `null` when `input` is too short to hold the escape or one of the expected
/// digits is not hexadecimal.
fn decodeUniversalChar(input: []const u8) ?DecodedUniversalChar {
    // The marker byte must be present before it can be inspected.
    if (input.len < 2) return null;

    const required: usize = if (input[1] == 'U') 10 else 6;
    if (input.len < required) return null;

    var codepoint: u32 = 0;
    for (input[2..required]) |c| {
        const digit: u32 = switch (c) {
            '0'...'9' => c - '0',
            'a'...'f' => 10 + (c - 'a'),
            'A'...'F' => 10 + (c - 'A'),
            else => return null,
        };
        // At most eight hex digits are accumulated, so this fits in a u32.
        codepoint = codepoint * 16 + digit;
    }

    return .{ .codepoint = codepoint, .consumed = required };
}

/// Walks an identifier one character at a time, yielding the same value for
/// a universal character name (`\uXXXX` / `\UXXXXXXXX`) as for the UTF-8
/// encoded character it names.
const CharIterator = struct {
    str: []const u8,
    i: usize = 0,

    /// Returns the next character, or `null` once the string is exhausted.
    ///
    /// Malformed input never fails: a backslash that does not start a valid
    /// universal character name is yielded as `'\\'`, and a byte that does
    /// not start a complete, valid UTF-8 sequence is yielded as its own
    /// value. In both cases the iterator advances by a single byte.
    fn next(self: *@This()) ?u32 {
        if (self.i >= self.str.len) return null;
        const rest = self.str[self.i..];

        if (rest.len >= 2 and rest[0] == '\\' and (rest[1] == 'u' or rest[1] == 'U')) {
            const decoded = decodeUniversalChar(rest) orelse {
                self.i += 1;
                return '\\';
            };
            self.i += decoded.consumed;
            return decoded.codepoint;
        }

        const len = std.unicode.utf8ByteSequenceLength(rest[0]) catch 1;
        // A lead byte may promise more bytes than remain; slicing a fixed
        // length array out of `rest` would then run past the end.
        if (len > rest.len) {
            self.i += 1;
            return rest[0];
        }

        const cp = switch (len) {
            1 => rest[0],
            2 => std.unicode.utf8Decode2(rest[0..2].*),
            3 => std.unicode.utf8Decode3(rest[0..3].*),
            4 => std.unicode.utf8Decode4(rest[0..4].*),
            else => unreachable,
        } catch {
            self.i += 1;
            return rest[0];
        };
        self.i += len;
        return cp;
    }
};

/// Reports whether two identifier spellings denote the same identifier.
///
/// Byte-identical spellings are equal. Otherwise the spellings can only be
/// equal when at least one of them contains a backslash, in which case both
/// are compared character by character with `CharIterator`.
fn eqlIdentifier(lhs: []const u8, rhs: []const u8) bool {
    if (mem.eql(u8, lhs, rhs)) return true;

    const lhs_has_escape = mem.indexOfScalar(u8, lhs, '\\') != null;
    const rhs_has_escape = mem.indexOfScalar(u8, rhs, '\\') != null;
    if (!lhs_has_escape and !rhs_has_escape) return false;

    var left: CharIterator = .{ .str = lhs };
    var right: CharIterator = .{ .str = rhs };
    while (left.next()) |l_char| {
        const r_char = right.next() orelse return false;
        if (l_char != r_char) return false;
    }
    // The left side is exhausted; equal only if the right side is as well.
    return right.next() == null;
}

/// Hashes an identifier so that a universal character name hashes like the
/// UTF-8 encoding of the character it names.
///
/// Every other byte is fed to the hasher unchanged. An escape that cannot be
/// turned into UTF-8 (truncated, non-hex digits, or a value that is not an
/// encodable code point) is not an error: its backslash is hashed as a plain
/// byte and hashing resumes with the byte after it.
fn hashIdentifier(input: []const u8) u32 {
    var hasher = std.hash.XxHash3.init(0);
    var i: usize = 0;
    while (i < input.len) {
        var space: [4]u8 = undefined;
        const str: []const u8 = blk: {
            ucn: {
                const starts_escape = input[i] == '\\' and i + 1 < input.len and
                    (input[i + 1] == 'u' or input[i + 1] == 'U');
                if (!starts_escape) break :ucn;

                const decoded = decodeUniversalChar(input[i..]) orelse break :ucn;
                // `\U` can spell values wider than 21 bits; treat those like
                // any other unencodable escape instead of casting blindly.
                const codepoint = std.math.cast(u21, decoded.codepoint) orelse break :ucn;
                const len = std.unicode.utf8Encode(codepoint, &space) catch break :ucn;

                i += decoded.consumed;
                break :blk space[0..len];
            }
            space[0] = input[i];
            i += 1;
            break :blk space[0..1];
        };
        hasher.update(str);
    }
    return @as(u32, @truncate(hasher.final()));
}

/// Handle for an interned identifier. Apart from `empty`, the integer value
/// is used as an index into the interner table's key list.
pub const StringId = enum(u32) {
    /// Reserved id that always resolves to the empty string.
    empty = std.math.maxInt(u32),
    _,

    /// Resolves `id` through the identifier interner owned by `comp`.
    pub fn lookup(id: StringId, comp: *const Compilation) []const u8 {
        return switch (id) {
            .empty => "",
            _ => comp.identifier_interner.table.keys()[@intFromEnum(id)],
        };
    }

    /// Resolves `id` through an explicitly supplied interner.
    pub fn lookupExtra(id: StringId, si: IdentifierInterner) []const u8 {
        return switch (id) {
            .empty => "",
            _ => si.table.keys()[@intFromEnum(id)],
        };
    }
};

/// Set of interned identifier spellings. Keys are hashed and compared through
/// `IdentifierContext`; an entry's insertion index is the numeric value of
/// its `StringId`. The key slices are stored as given and are not copied.
table: std.ArrayHashMapUnmanaged([]const u8, void, IdentifierContext, true) = .empty,

/// Frees the table's storage using `allocator` and then invalidates the
/// interner; it must not be used again afterwards.
pub fn deinit(si: *IdentifierInterner, allocator: mem.Allocator) void {
    // Runs after the table has been released.
    defer si.* = undefined;
    si.table.deinit(allocator);
}

/// Interns `str` and returns its id.
///
/// The bytes are not copied: `str` is externally owned and is stored in the
/// table as given. The empty string is never stored and always maps to
/// `.empty`. Fails only if growing the table fails.
pub fn intern(si: *IdentifierInterner, allocator: mem.Allocator, str: []const u8) !StringId {
    if (str.len == 0) return .empty;

    const entry = try si.table.getOrPut(allocator, str);
    const id: StringId = @enumFromInt(entry.index);
    return id;
}
2 changes: 1 addition & 1 deletion src/aro/Parser.zig
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const InitList = @import("InitList.zig");
const Preprocessor = @import("Preprocessor.zig");
const record_layout = @import("record_layout.zig");
const Source = @import("Source.zig");
const StringId = @import("StringInterner.zig").StringId;
const StringId = @import("IdentifierInterner.zig").StringId;
const SymbolStack = @import("SymbolStack.zig");
const Symbol = SymbolStack.Symbol;
const target_util = @import("target.zig");
Expand Down
35 changes: 0 additions & 35 deletions src/aro/StringInterner.zig

This file was deleted.

2 changes: 1 addition & 1 deletion src/aro/SymbolStack.zig
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ const Allocator = mem.Allocator;
const assert = std.debug.assert;

const Parser = @import("Parser.zig");
const StringId = @import("StringInterner.zig").StringId;
const StringId = @import("IdentifierInterner.zig").StringId;
const Tree = @import("Tree.zig");
const Token = Tree.Token;
const TokenIndex = Tree.TokenIndex;
Expand Down
36 changes: 35 additions & 1 deletion src/aro/Tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub const Token = struct {
eof,
/// identifier containing solely basic character set characters
identifier,
/// identifier with at least one extended character
/// identifier with at least one extended character or UCN escape sequence
extended_identifier,

// string literals with prefixes
Expand Down Expand Up @@ -1074,14 +1074,45 @@ pub fn next(self: *Tokenizer) Token {
pp_num,
pp_num_exponent,
pp_num_digit_separator,
ucn_slash,
ucn,
} = .start;

var start = self.index;
var id: Token.Id = .eof;
var ucn_wants: u8 = undefined;
var ucn_consumed: u8 = undefined;

while (self.index < self.buf.len) : (self.index += 1) {
const c = self.buf[self.index];
switch (state) {
.ucn_slash => switch (c) {
'u' => {
ucn_wants = 4;
ucn_consumed = 0;
state = .ucn;
},
'U' => {
ucn_wants = 8;
ucn_consumed = 0;
state = .ucn;
},
else => {
id = .invalid;
break;
},
},
.ucn => switch (c) {
'a'...'f', 'A'...'F', '0'...'9' => {
ucn_consumed += 1;
if (ucn_consumed == ucn_wants) {
state = .extended_identifier;
}
},
else => {
@panic("todo");
},
},
.start => switch (c) {
'\n' => {
id = .nl;
Expand All @@ -1100,6 +1131,7 @@ pub fn next(self: *Tokenizer) Token {
'u' => state = .u,
'U' => state = .U,
'L' => state = .L,
'\\' => state = .ucn_slash,
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
'=' => state = .equal,
'!' => state = .bang,
Expand Down Expand Up @@ -1325,6 +1357,7 @@ pub fn next(self: *Tokenizer) Token {
break;
},
0x80...0xFF => state = .extended_identifier,
'\\' => state = .ucn_slash,
else => {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
Expand Down Expand Up @@ -1732,6 +1765,7 @@ pub fn next(self: *Tokenizer) Token {
}
} else if (self.index == self.buf.len) {
switch (state) {
.ucn_slash, .ucn => @panic("todo"),
.start, .line_comment => {},
.u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
.extended_identifier => id = .extended_identifier,
Expand Down
4 changes: 2 additions & 2 deletions src/aro/TypeStore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ const Compilation = @import("Compilation.zig");
const LangOpts = @import("LangOpts.zig");
const record_layout = @import("record_layout.zig");
const Parser = @import("Parser.zig");
const StringInterner = @import("StringInterner.zig");
const StringId = StringInterner.StringId;
const IdentifierInterner = @import("IdentifierInterner.zig");
const StringId = IdentifierInterner.StringId;
const target_util = @import("target.zig");
const Tree = @import("Tree.zig");
const Node = Tree.Node;
Expand Down
17 changes: 17 additions & 0 deletions test/cases/ucn identifiers.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
int foo(void) {
int \u4F60\u597D = 5;
int \u0061 = 5; // TODO: error: character 'a' cannot be specified by a universal character name
return 你好;
}

struct S {
int 你好;
};

int bar(int x) {
struct S s;
s.\u4F60\u597D = x;
return s.你好;
}

#define TESTS_SKIPPED 1

0 comments on commit 22c237c

Please sign in to comment.