-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
StringInterner: add support for UCN identifiers
Closes #823
- Loading branch information
Showing
10 changed files
with
212 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
const std = @import("std"); | ||
const mem = std.mem; | ||
const Compilation = @import("Compilation.zig"); | ||
|
||
const IdentifierInterner = @This(); | ||
|
||
/// Hash-map context for identifier keys. Hashing and equality are delegated to
/// `hashIdentifier` and `eqlIdentifier`, so the context itself carries no state.
const IdentifierContext = struct {
    /// Returns the identifier hash of `s`.
    pub fn hash(_: @This(), s: []const u8) u32 {
        return hashIdentifier(s);
    }

    /// Returns whether `a` and `b` are the same identifier. The index of `b`
    /// in the map is not needed for the comparison.
    pub fn eql(_: @This(), a: []const u8, b: []const u8, _: usize) bool {
        return eqlIdentifier(a, b);
    }
};
|
||
/// Result of decoding one universal character name.
const DecodedUniversalChar = struct {
    /// Numeric value of the hex digits. It is not checked against the Unicode
    /// range, so it may be a surrogate or exceed 0x10FFFF.
    codepoint: u32,
    /// Number of input bytes occupied by the escape (6 for `\u`, 10 for `\U`).
    consumed: usize,
};

/// Decodes a C99-style universal character name (e.g., \uXXXX or \UXXXXXXXX)
/// into a unicode codepoint. Returns the decoded character and the number of
/// bytes consumed from the input string.
///
/// `input` is expected to start at the backslash. Only the second byte is
/// inspected to choose the form: `U` selects eight hex digits, anything else
/// selects four. Returns null when `input` is too short for the selected form
/// or when any of the expected digits is not hexadecimal.
fn decodeUniversalChar(input: []const u8) ?DecodedUniversalChar {
    // Guard the read of `input[1]` below; a lone trailing backslash is not a UCN.
    if (input.len < 2) return null;

    const is_long = input[1] == 'U';
    const required: usize = if (is_long) 10 else 6;
    if (input.len < required) return null;

    var codepoint: u32 = 0;
    for (input[2..required]) |c| {
        const value: u32 = switch (c) {
            '0'...'9' => c - '0',
            'a'...'f' => 10 + (c - 'a'),
            'A'...'F' => 10 + (c - 'A'),
            else => return null,
        };
        // At most eight hex digits are accumulated, so this cannot overflow a u32.
        codepoint = codepoint * 16 + value;
    }

    return .{ .codepoint = codepoint, .consumed = required };
}
|
||
/// Iterates over the characters of an identifier, yielding one value per
/// universal character name (`\uXXXX` / `\UXXXXXXXX`) or per UTF-8 sequence.
/// Malformed input is never an error: bytes that cannot be decoded are yielded
/// one at a time as their raw byte value.
const CharIterator = struct {
    str: []const u8,
    /// Byte offset of the next character to yield.
    i: usize = 0,

    /// Returns the next character, or null once the whole string is consumed.
    fn next(self: *@This()) ?u32 {
        if (self.i >= self.str.len) return null;
        const rest = self.str[self.i..];

        if (rest.len >= 2 and rest[0] == '\\' and (rest[1] == 'u' or rest[1] == 'U')) {
            const decoded = decodeUniversalChar(rest) orelse {
                // Not a well-formed UCN: yield the backslash itself and let
                // the following bytes be iterated normally.
                self.i += 1;
                return '\\';
            };
            self.i += decoded.consumed;
            return decoded.codepoint;
        }

        const len = std.unicode.utf8ByteSequenceLength(rest[0]) catch 1;
        if (len > rest.len) {
            // Truncated multi-byte sequence at the end of the string. Slicing
            // `len` bytes would run out of bounds, so yield the lead byte as-is.
            self.i += 1;
            return rest[0];
        }

        const cp = switch (len) {
            1 => rest[0],
            2 => std.unicode.utf8Decode2(rest[0..2].*),
            3 => std.unicode.utf8Decode3(rest[0..3].*),
            4 => std.unicode.utf8Decode4(rest[0..4].*),
            else => unreachable,
        } catch {
            // Invalid sequence: yield the lead byte and resume at the next byte.
            self.i += 1;
            return rest[0];
        };
        self.i += len;
        return cp;
    }
};
|
||
/// Returns whether `lhs` and `rhs` spell the same identifier, treating a
/// universal character name and the character it denotes as equal.
fn eqlIdentifier(lhs: []const u8, rhs: []const u8) bool {
    // Fast path: byte-identical spellings.
    if (mem.eql(u8, lhs, rhs)) return true;

    // Without a backslash on either side there is no UCN to expand, so
    // differing bytes mean differing identifiers.
    const has_escape = mem.indexOfScalar(u8, lhs, '\\') != null or
        mem.indexOfScalar(u8, rhs, '\\') != null;
    if (!has_escape) return false;

    var left: CharIterator = .{ .str = lhs };
    var right: CharIterator = .{ .str = rhs };
    while (left.next()) |l_char| {
        const r_char = right.next() orelse return false;
        if (l_char != r_char) return false;
    }
    // `lhs` is exhausted; the identifiers match only if `rhs` is too.
    return right.next() == null;
}
|
||
/// Hashes an identifier so that a universal character name (`\uXXXX` or
/// `\UXXXXXXXX`) hashes the same as the UTF-8 encoding of the character it
/// denotes. All other bytes are hashed verbatim.
///
/// A `\u` / `\U` sequence that is truncated, contains non-hex digits, or names
/// a value that cannot be encoded as UTF-8 is not expanded: its backslash is
/// hashed as a plain byte and hashing resumes at the following byte.
///
/// NOTE(review): because un-encodable escapes are hashed by their spelling,
/// `\uD800` and `\U0000D800` hash differently even though `eqlIdentifier`
/// compares them by value. Confirm whether such identifiers can reach the
/// interner.
fn hashIdentifier(input: []const u8) u32 {
    var hasher = std.hash.XxHash3.init(0);
    var i: usize = 0;
    while (i < input.len) {
        var space: [4]u8 = undefined;
        // Default: hash the current byte as-is.
        var chunk: []const u8 = input[i .. i + 1];
        var advance: usize = 1;

        const starts_ucn = input[i] == '\\' and i + 1 < input.len and
            (input[i + 1] == 'u' or input[i + 1] == 'U');
        if (starts_ucn) {
            if (decodeUniversalChar(input[i..])) |decoded| {
                // Values above the u21 range (e.g. `\UFFFFFFFF`) cannot be
                // passed to `utf8Encode`; treat them like any other
                // un-encodable escape instead of tripping a cast check.
                if (std.math.cast(u21, decoded.codepoint)) |scalar| {
                    if (std.unicode.utf8Encode(scalar, &space)) |len| {
                        chunk = space[0..len];
                        advance = decoded.consumed;
                    } else |_| {
                        // Surrogate half or value above 0x10FFFF: keep the
                        // raw-byte default chosen above.
                    }
                }
            }
        }

        hasher.update(chunk);
        i += advance;
    }
    return @as(u32, @truncate(hasher.final()));
}
|
||
/// Handle for an interned identifier. Every value other than `empty` is used
/// as an index into the interner table's key list.
pub const StringId = enum(u32) {
    /// Reserved handle for the zero-length string; it has no table entry.
    empty = std.math.maxInt(u32),
    _,

    /// Returns the string for `id`, looked up in `comp.identifier_interner`.
    pub fn lookup(id: StringId, comp: *const Compilation) []const u8 {
        return switch (id) {
            .empty => "",
            _ => comp.identifier_interner.table.keys()[@intFromEnum(id)],
        };
    }

    /// Returns the string for `id`, looked up in the interner `si`.
    pub fn lookupExtra(id: StringId, si: IdentifierInterner) []const u8 {
        return switch (id) {
            .empty => "",
            _ => si.table.keys()[@intFromEnum(id)],
        };
    }
};
|
||
/// Set of interned identifier slices, hashed and compared through
/// `IdentifierContext`. A key's index in this map is the numeric value of its
/// `StringId`.
table: std.ArrayHashMapUnmanaged([]const u8, void, IdentifierContext, true) = .empty,

/// Releases the table's storage using `allocator` and leaves `si` undefined,
/// so it must not be used again afterwards.
pub fn deinit(si: *IdentifierInterner, allocator: mem.Allocator) void {
    defer si.* = undefined;
    si.table.deinit(allocator);
}
|
||
/// Intern externally owned string.
///
/// The slice is stored in the table as-is (no copy is made here). The
/// zero-length string is never stored and always maps to `StringId.empty`.
/// Returns an error if growing the table fails.
pub fn intern(si: *IdentifierInterner, allocator: mem.Allocator, str: []const u8) !StringId {
    return switch (str.len) {
        0 => .empty,
        // The entry's position in the table doubles as its id, whether the
        // entry was just inserted or already present.
        else => @enumFromInt((try si.table.getOrPut(allocator, str)).index),
    };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// Declares a local whose name is spelled with universal character names
// (\u4F60\u597D) and returns it through its literal UTF-8 spelling, so both
// spellings must resolve to the same identifier.
int foo(void) { | ||
int \u4F60\u597D = 5; | ||
int \u0061 = 5; // TODO: error: character 'a' cannot be specified by a universal character name | ||
return 你好; | ||
} | ||
|
||
// Struct whose only member is named with literal (non-escaped) UTF-8 characters.
struct S { | ||
int 你好; | ||
}; | ||
|
||
// Stores `x` into the member of `struct S` using the universal-character-name
// spelling of its name, then reads it back using the literal UTF-8 spelling.
int bar(int x) { | ||
struct S s; | ||
s.\u4F60\u597D = x; | ||
return s.你好; | ||
} | ||
|
||
#define TESTS_SKIPPED 1 |