Skip to content

Commit

Permalink
Improved CharStream class
Browse files Browse the repository at this point in the history
- Avoid duplicate data in CharStream (as an array and a string) and make character indexing work for the full Unicode range. Use a typed array instead of a plain number array, saving even more space.
- This also fixed getting substrings using intervals.
- Added unit tests for CharStream.

Signed-off-by: Mike Lischke <[email protected]>
  • Loading branch information
mike-lischke committed Nov 25, 2023
1 parent 7de0497 commit 8f8331a
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 44 deletions.
59 changes: 21 additions & 38 deletions src/CharStream.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,21 @@ import { Token } from "./Token.js";
import { Interval } from "./misc/Interval.js";
import { IntStream } from "./IntStream.js";

/**
* If decodeToUnicodeCodePoints is true, the input is treated
* as a series of Unicode code points.
*
* Otherwise, the input is treated as a series of 16-bit UTF-16 code
* units.
*/
// TODO: CharStream should be an interface, not a class.
export class CharStream implements IntStream {
public name = "";
public index = 0;

private stringData: string;
private data: number[];
private decodeToUnicodeCodePoints: boolean;

public constructor(data: string, decodeToUnicodeCodePoints = false) {
this.stringData = data;
this.decodeToUnicodeCodePoints = decodeToUnicodeCodePoints;

this.data = [];
if (this.decodeToUnicodeCodePoints) {
for (let i = 0; i < this.stringData.length;) {
const codePoint = this.stringData.codePointAt(i)!;
this.data.push(codePoint);
i += codePoint <= 0xFFFF ? 1 : 2;
}
} else {
this.data = new Array(this.stringData.length);
for (let i = 0; i < this.stringData.length; i++) {
this.data[i] = this.stringData.charCodeAt(i);
}
private data: Uint32Array;

public constructor(input: string) {
// Convert input to UTF-32 code points.
const codePoints: number[] = [];
for (const char of input) {
codePoints.push(char.codePointAt(0)!);
}

this.data = new Uint32Array(codePoints);
}

/**
Expand Down Expand Up @@ -119,21 +101,12 @@ export class CharStream implements IntStream {
if (begin >= this.data.length) {
return "";
} else {
if (this.decodeToUnicodeCodePoints) {
let result = "";
for (let i = begin; i <= end; i++) {
result += String.fromCodePoint(this.data[i]);
}

return result;
} else {
return this.stringData.slice(begin, end + 1);
}
return this.#stringFromRange(begin, end + 1);
}
}

public toString(): string {
return this.stringData;
return this.#stringFromRange(0);
}

public get size(): number {
Expand All @@ -147,4 +120,14 @@ export class CharStream implements IntStream {
return IntStream.UNKNOWN_SOURCE_NAME;
}
}

/**
 * Converts a range of the stored code points back into a string.
 *
 * @param start Index of the first code point to include.
 * @param stop Index just past the last code point to include. When omitted, the range runs to the end
 *             of the data.
 *
 * @returns The text formed by the code points in `[start, stop)`.
 */
#stringFromRange(start: number, stop?: number): string {
    // `subarray` is a view on the same buffer (same index semantics as `slice`), so the code points
    // are not copied just to be read once.
    const codePoints = this.data.subarray(start, stop);

    let result = "";
    for (const codePoint of codePoints) {
        result += String.fromCodePoint(codePoint);
    }

    return result;
}
}
4 changes: 2 additions & 2 deletions src/CharStreams.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import { CharStream } from "./CharStream.js";
export class CharStreams {
// Creates a CharStream from a string.
public static fromString(str: string): CharStream {
return new CharStream(str, true);
return new CharStream(str);
}

/**
Expand All @@ -30,6 +30,6 @@ export class CharStreams {
* encoding is null).
*/
public static fromBuffer(buffer: Buffer, encoding?: BufferEncoding): CharStream {
return new CharStream(buffer.toString(encoding), true);
return new CharStream(buffer.toString(encoding));
}
}
4 changes: 2 additions & 2 deletions src/ParserRuleContext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ export class ParserRuleContext extends RuleContext {
this.stop = null;
}

public override get parent(): this | null {
return super.parent;
public override get parent(): ParserRuleContext | null {
return super.parent as ParserRuleContext;
}

public override set parent(parent: ParserRuleContext | null) {
Expand Down
4 changes: 2 additions & 2 deletions src/RuleContext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ export class RuleContext implements ParseTree {
this.invokingState = invokingState ?? -1;
}

public get parent(): this | null {
return this.#parent as this;
public get parent(): RuleContext | null {
return this.#parent;
}

public set parent(parent: RuleContext | null) {
Expand Down
157 changes: 157 additions & 0 deletions tests/CharStream.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
* Copyright (c) The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/

import { CharStream, IntStream, Interval } from "../src/index.js";

const unicodeInput = "Hello 👋, World! 😁";

/**
 * Unit tests for CharStream, run against a sample string that contains characters outside the
 * Basic Multilingual Plane. Indices and sizes below are counted in code points, not UTF-16 code units.
 */
describe("CharStream", () => {
    // Each test gets its own stream over the shared sample, so index changes cannot leak between tests.
    let stream: CharStream;

    beforeEach(() => {
        stream = new CharStream(unicodeInput);
    });

    describe("constructor", () => {
        it("should initialize the CharStream with the provided string data", () => {
            const text = stream.toString();
            expect(text).toBe(unicodeInput);
        });
    });

    describe("reset", () => {
        it("should reset the CharStream index to 0", () => {
            stream.index = 5;
            stream.reset();

            expect(stream.index).toBe(0);
        });
    });

    describe("consume", () => {
        it("should increment the CharStream index by 1", () => {
            stream.consume();

            expect(stream.index).toBe(1);
        });

        it("should throw an error if the CharStream index is at the end of the data", () => {
            // Position the stream right behind the last code point.
            stream.index = stream.size;
            const consumeAtEnd = () => { return stream.consume(); };

            expect(consumeAtEnd).toThrow("cannot consume EOF");
        });
    });

    describe("LA", () => {
        it("should return the character at the specified offset from the current index", () => {
            // The waving hand is the 7th code point of the sample.
            const expected = "👋".codePointAt(0);

            expect(stream.LA(7)).toBe(expected);
        });

        it("should return 0 if the offset is 0", () => {
            expect(stream.LA(0)).toBe(0);
        });

        it("should return Token.EOF if the offset is out of range", () => {
            expect(stream.LA(20)).toBe(-1);
        });
    });

    describe("mark/release", () => {
        it("should return -1 for mark", () => {
            expect(stream.mark()).toBe(-1);
        });

        it("should do nothing for release", () => {
            const release = () => { return stream.release(0); };

            expect(release).not.toThrow();
        });
    });

    describe("seek", () => {
        it("should set the CharStream index to the specified index if it is less than or equal to the current index",
            () => {
                stream.index = 5;
                stream.seek(3);

                expect(stream.index).toBe(3);
            });

        it("should set the CharStream index to the specified index if it is greater than the current index", () => {
            stream.index = 5;
            stream.seek(8);

            expect(stream.index).toBe(8);
        });

        it("should not set CharStream index beyond the data length", () => {
            stream.index = 5;
            stream.seek(20);

            // The sample has 17 code points, so the index is expected to stop there.
            expect(stream.index).toBe(17);
        });
    });

    describe("getText", () => {
        it("should return an empty string if the begin index is greater than or equal to the data length", () => {
            expect(stream.getText(20, 10)).toBe("");
        });

        it("should return the correct substring when given an interval", () => {
            const range = new Interval(6, 11);

            expect(stream.getText(range)).toBe("👋, Wor");
        });

        it("should return the correct substring when given a start and stop index", () => {
            expect(stream.getText(8, 11)).toBe(" Wor");
        });

        it("should handle stop index greater than data length", () => {
            expect(stream.getText(7, 20)).toBe(", World! 😁");
        });

    });

    describe("toString", () => {
        it("should return the string data of the CharStream", () => {
            expect(stream.toString()).toBe(unicodeInput);
        });
    });

    describe("size", () => {
        it("should return the size of the CharStream data", () => {
            // Each emoji counts as a single entry.
            expect(stream.size).toBe(17);
        });
    });

    describe("getSourceName", () => {
        it("should return the name of the CharStream if it is set", () => {
            stream.name = "Test";

            expect(stream.getSourceName()).toBe("Test");
        });

        it("should return IntStream.UNKNOWN_SOURCE_NAME if the name is not set", () => {
            expect(stream.getSourceName()).toBe(IntStream.UNKNOWN_SOURCE_NAME);
        });
    });
});

0 comments on commit 8f8331a

Please sign in to comment.