utf8.pas

{$MODE OBJFPC} { -*- delphi -*- }
{$INCLUDE settings.inc}
{$CODEPAGE UTF8}
unit utf8;

interface

//{$DEFINE TESTS}

uses
   unicode;

type
   TUTF8SequenceLength = 1..4;
   TZeroableUTF8SequenceLength = 0..4;
   TUTF8Sequence = packed record
     case Byte of
        0: (Length: TUTF8SequenceLength;
            case Byte of 
               0: (Byte1, Byte2, Byte3, Byte4: Byte);
               1: (Start: record end));
        1: (AsString: String[4]);
   end;
   operator := (const Value: TUTF8Sequence): UTF8String; deprecated;
   operator + (const Op1: TUTF8Sequence; const Op2: UTF8String): UTF8String;
   operator + (const Op1: UTF8String; const Op2: TUTF8Sequence): UTF8String;
   operator + (const Op1, Op2: TUTF8Sequence): UTF8String;

function IsValidUTF8(const Value: RawByteString): Boolean;

function UTF8ToCodepoint(const Data: PByte; out BytesRead: TZeroableUTF8SequenceLength): TUnicodeCodepoint; inline;
function UTF8ToCodepoint(const Data: PByte; out BytesRead: TUTF8SequenceLength): TUnicodeCodepoint; inline;
function UTF8ToUTF8Length(const Data: PByte): TUTF8SequenceLength; inline;
  // WARNING: There are no length guards in these!
  // Value needs to be at least 4 bytes long if you have no idea what it is.
  // If it is less than four bytes, then there needs to be at least one byte
  // with its high bit unset in the first three bytes.
  // A simple way to guard against this is to add a trailing #0 to your input.

function UTF8ToCodepoint(const Data: PUTF8String; const Position: Cardinal; out BytesRead: TZeroableUTF8SequenceLength): TUnicodeCodepoint; inline;
function UTF8ToCodepoint(const Data: PUTF8String; const Position: Cardinal; out BytesRead: TUTF8SequenceLength): TUnicodeCodepoint; inline;
function UTF8ToUTF8Length(const Data: PUTF8String; const Position: Cardinal): TUTF8SequenceLength; inline;
  // these handle the length guarding

function CodepointToUTF8(const Value: TUnicodeCodepoint): TUTF8Sequence; inline;
function CodepointToUTF8Length(const Value: TUnicodeCodepoint): TUTF8SequenceLength; inline;

function CodepointArrayToUTF8String(const Value: TUnicodeCodepointArray): UTF8String;

implementation

uses exceptions, sysutils;

// TUTF8Sequence

operator := (const Value: TUTF8Sequence): UTF8String;
begin
   Result := Value.AsString;
end;

operator + (const Op1: TUTF8Sequence; const Op2: UTF8String): UTF8String;
begin
   Result := Op1.AsString + Op2;
end;

operator + (const Op1: UTF8String; const Op2: TUTF8Sequence): UTF8String;
begin
   Result := Op1 + Op2.AsString;
end;

operator + (const Op1, Op2: TUTF8Sequence): UTF8String;
begin
   Result := Op1.AsString + Op2.AsString;
end;

// API

function IsValidUTF8(const Value: RawByteString): Boolean;
var
   Index: Cardinal;

   function ConsumeContinuationByte(): Boolean; inline;
   begin
      Inc(Index);
      Result := (Index <= Length(Value)) and (Ord(Value[Index]) in [$80..$BF]);
   end;

var
   C: Char;
   Codepoint: Cardinal;
begin
   Result := False;
   Index := 1;
   while (Index <= Length(Value)) do
   begin
      C := Value[Index];
      case Ord(C) of
         $80..$BF: exit; // unexpected continuation byte
         $C0, $C1: exit; // forcibly an overlong sequence
         $C2..$DF: if (not ConsumeContinuationByte()) then exit; // two-byte sequence
         $E0: // three-byte sequence with possible overlong sequence
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // short
              Codepoint := ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); // $R-
              if (Codepoint < $0800) then
                 exit; // overlong
           end;
         $E1..$EC: if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // three-byte sequence
         $ED: // three-byte sequence with possible surrogates
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // short
              Codepoint := $D000 + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); // $R-
              Assert(Codepoint < $E000);
              if (Codepoint >= $D800) then
                 exit; // surrogate
           end;
         $EE..$EF: if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // three-byte sequence
         $F0: // four-byte sequence with possible overlong sequence
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // short
              Codepoint := ((Ord(Value[Index-2])-$80) shl 12) + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); // $R-
              if (Codepoint < $10000) then
                 exit; // overlong
           end;
         $F1..$F3: if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // four-byte sequence
         $F4: // four-byte sequence with possible too-high codepoint
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then exit; // short
              Codepoint := $100000 + ((Ord(Value[Index-2])-$80) shl 12) + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); // $R-
              if (Codepoint > $10FFFF) then
                 exit; // overlong
           end;
         $F5..$FD: exit; // forcibly an invalid Unicode character
         $FE, $FF: exit; // non-UTF-8 bytes
      end;
      Inc(Index);
   end;
   Result := True;
end;

function UTF8ToCodepoint(const Data: PByte; out BytesRead: TUTF8SequenceLength): TUnicodeCodepoint;

   procedure Return(const Codepoint: TUnicodeCodepoint; const Length: TUTF8SequenceLength); inline;
   begin
      Result := Codepoint;
      BytesRead := Length;
   end;

begin
   case (Data+0)^ of
      $00..$7F: Return((Data+0)^, 1); // ASCII
      $C2..$DF:
         if ((Data+1)^ in [$80..$BF]) then
            Return(((Data+0)^ and %00011111) shl 6 +
                   ((Data+1)^ and %00111111), 2) // $R-
         else // missing continuation byte
            Return($FFFD, 1);
      $E0: // three-byte sequence with possible overlong sequence
         if ((Data+1)^ in [$A0..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               Assert(((Data+0)^ and %00001111) = 0);
               Return({((Data+0)^ and %00001111) shl 12 +} // always zero
                       ((Data+1)^ and %00111111) shl 6 +
                       ((Data+2)^ and %00111111), 3); // $R-
            end
            else
               Return($FFFD, 2);
         end
         else
            Return($FFFD, 1);
      $E1..$EC, $EE..$EF: // three-byte sequence
         if ((Data+1)^ in [$80..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               Return(((Data+0)^ and %00001111) shl 12 +
                      ((Data+1)^ and %00111111) shl 6 +
                      ((Data+2)^ and %00111111), 3); // $R-
            end
            else
               Return($FFFD, 2);
         end
         else
            Return($FFFD, 1);
      $ED: // three-byte sequence with possible surrogates
         if ((Data+1)^ in [$80..$9F]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               Return(((Data+0)^ and %00001111) shl 12 +
                      ((Data+1)^ and %00111111) shl 6 +
                      ((Data+2)^ and %00111111), 3); // $R-
               Assert(Result.Value < $D800);
            end
            else
               Return($FFFD, 2);
         end
         else
            Return($FFFD, 1);
      $F0: // four-byte sequence with possible overlong sequence
         if ((Data+1)^ in [$90..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               if ((Data+3)^ in [$80..$BF]) then
               begin
                  Assert(((Data+0)^ and %00001111) = 0);
                  Return({((Data+0)^ and %00001111) shl 18 +} // always zero
                          ((Data+1)^ and %00111111) shl 12 +
                          ((Data+2)^ and %00111111) shl 6 +
                          ((Data+3)^ and %00111111), 4); // $R-
               end
               else
                  Return($FFFD, 3);
            end
            else
               Return($FFFD, 2);
         end
         else
            Return($FFFD, 1);
      $F1..$F3:
         if ((Data+1)^ in [$80..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               if ((Data+3)^ in [$80..$BF]) then
               begin
                  Return(((Data+0)^ and %00001111) shl 18 +
                         ((Data+1)^ and %00111111) shl 12 +
                         ((Data+2)^ and %00111111) shl 6 +
                         ((Data+3)^ and %00111111), 4); // $R-
               end
               else
                  Return($FFFD, 3);
            end
            else
               Return($FFFD, 2);
         end
         else
            Return($FFFD, 1);
      $F4: // four-byte sequence with possible too-high codepoint
         if ((Data+1)^ in [$80..$8F]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               if ((Data+3)^ in [$80..$BF]) then
               begin
                  Return(((Data+0)^ and %00001111) shl 18 +
                         ((Data+1)^ and %00111111) shl 12 +
                         ((Data+2)^ and %00111111) shl 6 +
                         ((Data+3)^ and %00111111), 4); // $R-
                  Assert(Result.Value <= $10FFFF);
               end
               else
                  Return($FFFD, 3);
            end
            else
               Return($FFFD, 2);
         end
         else
            Return($FFFD, 1);
   else
      // $80..$BF, // unexpected continuation byte
      // $C0, $C1, // forcibly an overlong sequence
      // $F5..$FD, // forcibly an invalid Unicode character
      // $FE, $FF: // non-UTF-8 bytes
         Return($FFFD, 1);
   end;
   Assert((Result.Value < $D800) or (Result.Value > $DFFF)); // not a surrogate // $DFA- for Result
   Assert(Result.Value <= $10FFFF);
end;

function UTF8ToCodepoint(const Data: PByte; out BytesRead: TZeroableUTF8SequenceLength): TUnicodeCodepoint;
begin
   Result := UTF8ToCodepoint(Data, TUTF8SequenceLength(BytesRead));
end;

function UTF8ToCodepoint(const Data: PUTF8String; const Position: Cardinal; out BytesRead: TUTF8SequenceLength): TUnicodeCodepoint;
var
   Buffer: array[0..3] of AnsiChar;
begin
   Assert(Position <= Length(Data^));
   if (Position+3 <= Length(Data^)) then
   begin
      Result := UTF8ToCodepoint(PByte(PPointer(Data)^+Position-1), BytesRead);
   end
   else
   begin
      Buffer[0] := Data^[Position];
      if (Position+1 <= Length(Data^)) then
      begin
         Buffer[1] := Data^[Position+1];
         if (Position+2 <= Length(Data^)) then
         begin
            Buffer[2] := Data^[Position+2];
            Assert(Position+3 > Length(Data^));
            Buffer[3] := #0;
         end
         else
         begin
            Buffer[2] := #0;
         end;
      end
      else
      begin
         Buffer[1] := #0;
      end;
      Result := UTF8ToCodepoint(PByte(@Buffer), BytesRead);
   end;
end;

function UTF8ToCodepoint(const Data: PUTF8String; const Position: Cardinal; out BytesRead: TZeroableUTF8SequenceLength): TUnicodeCodepoint;
begin
   Result := UTF8ToCodepoint(Data, Position, TUTF8SequenceLength(BytesRead));
end;

function UTF8ToUTF8Length(const Data: PByte): TUTF8SequenceLength; inline;
begin
   case (Data+0)^ of
      $00..$7F: Result := 1; // ASCII
      $C2..$DF:
         if ((Data+1)^ in [$80..$BF]) then
            Result := 2
         else // missing continuation byte
            Result := 1;
      $E0: // three-byte sequence with possible overlong sequence
         if ((Data+1)^ in [$A0..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
               Result := 3
            else
               Result := 2;
         end
         else
            Result := 1;
      $E1..$EC, $EE..$EF: // three-byte sequence
         if ((Data+1)^ in [$80..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
               Result := 3
            else
               Result := 2;
         end
         else
            Result := 1;
      $ED: // three-byte sequence with possible surrogates
         if ((Data+1)^ in [$80..$9F]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
               Result := 3
            else
               Result := 2;
         end
         else
            Result := 1;
      $F0: // four-byte sequence with possible overlong sequence
         if ((Data+1)^ in [$90..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               if ((Data+3)^ in [$80..$BF]) then
                  Result := 4
               else
                  Result := 3;
            end
            else
               Result := 2;
         end
         else
            Result := 1;
      $F1..$F3:
         if ((Data+1)^ in [$80..$BF]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               if ((Data+3)^ in [$80..$BF]) then
                  Result := 4
               else
                  Result := 3;
            end
            else
               Result := 2;
         end
         else
            Result := 1;
      $F4: // four-byte sequence with possible too-high codepoint
         if ((Data+1)^ in [$80..$8F]) then
         begin
            if ((Data+2)^ in [$80..$BF]) then
            begin
               if ((Data+3)^ in [$80..$BF]) then
                  Result := 4
               else
                  Result := 3;
            end
            else
               Result := 2;
         end
         else
            Result := 1;
   else
      // $80..$BF, // unexpected continuation byte
      // $C0, $C1, // forcibly an overlong sequence
      // $F5..$FD, // forcibly an invalid Unicode character
      // $FE, $FF: // non-UTF-8 bytes
         Result := 1;
   end;
end;

function UTF8ToUTF8Length(const Data: PUTF8String; const Position: Cardinal): TUTF8SequenceLength;
var
   Buffer: array[0..3] of AnsiChar;
begin
   if (Position+3 <= Length(Data^)) then
   begin
      Result := UTF8ToUTF8Length(PByte(PPointer(Data)^+Position-1));
   end
   else
   begin
      Buffer[0] := Data^[Position];
      if (Position+1 <= Length(Data^)) then
      begin
         Buffer[1] := Data^[Position+1];
         if (Position+2 <= Length(Data^)) then
         begin
            Buffer[2] := Data^[Position+2];
            Assert(Position+3 > Length(Data^));
            Buffer[3] := #0;
         end
         else
         begin
            Buffer[2] := #0;
         end;
      end
      else
      begin
         Buffer[1] := #0;
      end;
      Result := UTF8ToUTF8Length(PByte(@Buffer));
   end;
end;

function CodepointToUTF8(const Value: TUnicodeCodepoint): TUTF8Sequence;
begin
   Assert(Value.Value <> kEOF, 'Cannot convert virtual EOF codepoint to UTF-8');
   case (Value.Value) of
      $000000 .. $00007F:
         begin
            Result.Length := 1;
            Result.Byte1 := Byte(Value.Value);
            exit;
         end;
      $000080 .. $0007FF:
         begin
            Result.Length := 2;
            Result.Byte1 := Byte(%11000000 or (Value.Value shr 6));
            Result.Byte2 := Byte(%10000000 or (Value.Value and %00000111111));
            exit;
         end;
      $000800 .. $00D7FF,
      $00E000 .. $00FFFF:
         begin
            Result.Length := 3;
            Result.Byte1 := Byte(%11100000 or (Value.Value shr 12));
            Result.Byte2 := Byte(%10000000 or (Value.Value and %0000111111000000) shr 6);
            Result.Byte3 := Byte(%10000000 or (Value.Value and %0000000000111111));
            exit;
         end;
      $010000 .. $10FFFF:
         begin
            Result.Length := 4;
            Result.Byte1 := Byte(%11110000 or (Value.Value shr 18));
            Result.Byte2 := Byte(%10000000 or (Value.Value and %0000111111000000000000) shr 12);
            Result.Byte3 := Byte(%10000000 or (Value.Value and %0000000000111111000000) shr 6);
            Result.Byte4 := Byte(%10000000 or (Value.Value and %0000000000000000111111));
            exit;
         end;
      else
         ; // see range checks below
   end;
   Assert((Value.Value < $D800) or (Value.Value > $DFFF), 'Surrogate character passed to CodepointToUTF8()');
   Assert(Value.Value <= $10FFFF, 'Out of range value passed to CodepointToUTF8()');
   Assert(False, 'Internal error in CodepointToUTF8() for value ' + IntToStr(Value.Value));
   Result.AsString := '';
end;

function CodepointToUTF8Length(const Value: TUnicodeCodepoint): TUTF8SequenceLength;
begin
   Assert(Value.Value <> kEOF, 'Cannot convert virtual EOF codepoint to UTF-8');
   case (Value.Value) of
      $000000 .. $00007F: Result := 1;
      $000080 .. $0007FF: Result := 2;
      $000800 .. $00D7FF,
      $00E000 .. $00FFFF: Result := 3;
      $010000 .. $10FFFF: Result := 4;
      else
         Assert((Value.Value < $D800) or (Value.Value > $DFFF), 'Surrogate character passed to CodepointToUTF8Length()');
         Assert(Value.Value <= $10FFFF, 'Out of range value passed to CodepointToUTF8Length()');
         Assert(False, 'Internal error in CodepointToUTF8Length() for value ' + IntToStr(Value.Value));
         Result := 1;
   end;
end;

function CodepointArrayToUTF8String(const Value: TUnicodeCodepointArray): UTF8String;
var
   Index, NewLength, Offset: Cardinal;
   Character: TUTF8Sequence;
begin
   if (Length(Value) = 0) then
   begin
      Result := '';
      exit;
   end;
   NewLength := 0;
   for Index := Low(Value) to High(Value) do // $R-
      Inc(NewLength, CodepointToUTF8Length(Value[Index]));
   SetLength(Result, NewLength);
   Offset := 1;
   for Index := Low(Value) to High(Value) do // $R-
   begin
      Character := CodepointToUTF8(Value[Index]);
      Move(Character.Start, Result[Offset], Character.Length);
      Inc(Offset, Character.Length);
   end;
end;

{$IFDEF DEBUG}
procedure TestUTF8Decoder();
var
   S: UTF8String;
   Incrementor: TUTF8SequenceLength;
   Index, TestIndex: Integer;
begin
   S := #$ed#$a0#$80#$ed#$b0#$80#0#0#0#0; // U+D800 U+DC00, paired UTF-8 surrogates
   Assert(Length(S) = 10, IntToStr(Length(S)));
   Index := 1;
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   Assert(Index = 7);
   S := #$ef#$bf#$be#0#0#0#0; // U+FFFE
   Index := 1;
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFE);
   Assert(Incrementor = 3); Inc(Index, Incrementor);
   Assert(Index = 4);
   S := #$c0#$80#$e0#$80#$80#$f0#$80#$80#$80#$f8#$80#$80#$80#$80#$fc#$80#$80#$80#$80#$80#0#0#0#0; // overlong nulls
   Index := 1;
   for TestIndex := 1 to 20 do // $R-
   begin
      Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFD);
   Assert(Incrementor = 1); Inc(Index, Incrementor);
   end;
   Assert(Index = 21);
   S := #$ef#$bf#$be#0#0#0#0; // U+FFFE
   Index := 1;
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $FFFE);
   Assert(Incrementor = 3); Inc(Index, Incrementor);
   Assert(Index = 4);
   S := #$F0#$A4#$AD#$A2; // U+24B62
   Index := 1;
   Assert(UTF8ToCodepoint(PByte(@S[Index]), Incrementor) = $24B62);
   Assert(Incrementor = 4); Inc(Index, Incrementor);
   Assert(Index = 5);
end;

procedure TestUTF8Encoder();
begin
   Assert(CodepointToUTF8($0024).AsString = #$24);
   Assert(CodepointToUTF8($00A2).AsString = #$C2#$A2);
   Assert(CodepointToUTF8($20AC).AsString = #$E2#$82#$AC);
   Assert(CodepointToUTF8($24B62).AsString = #$F0#$A4#$AD#$A2);
end;

procedure TestUTF8();
var
   S: UTF8String;
   SubS: TUTF8Sequence;
   Index: TUnicodeCodepointRange;
   Position: Cardinal;
   Incrementor: TUTF8SequenceLength;
begin
   SetLength(S, ($80 * 1) + (($7FF-$7F) * 2) + ((($FFFF-$7FF) - ($E000-$D800)) * 3) + (($10FFFF - $FFFF) * 4) + 1); // $DFA- for S
   S[Length(S)] := #0; // fence for UTF8ToCodepoint()
   Position := 1;
   for Index := $0 to $D7FF do
   begin
      SubS := CodepointToUTF8(Index);
      Move(SubS.Start, S[Position], SubS.Length);
      Inc(Position, SubS.Length);
   end;
   for Index := $E000 to $10FFFF do
   begin
      SubS := CodepointToUTF8(Index);
      Move(SubS.Start, S[Position], SubS.Length);
      Inc(Position, SubS.Length);
   end;
   Assert(Position = Length(S));
   Index := -1;
   Position := 1;
   while (Position < Length(S)) do
   begin
      Inc(Index);
      if (Index = $D800) then
         Index := $E000;
      if (UTF8ToCodepoint(PByte(@S[Position]), Incrementor) <> Index) then
      begin
         Writeln('looking for character with index U+', IntToHex(Index, 4), ' at position ', Position);
         Writeln('  ', IntToHex(Ord(S[Position]), 2), ' ',
                       IntToHex(Ord(S[Position+1]), 2), ' ',
                       IntToHex(Ord(S[Position+2]), 2), ' ',
                       IntToHex(Ord(S[Position+3]), 2), ' ',
                 'Value found was: U+', IntToHex(UTF8ToCodepoint(PByte(@S[Position]), Incrementor).Value, 4), ' using ', Incrementor, ' bytes');
      end;
      Assert(UTF8ToCodepoint(PByte(@S[Position]), Incrementor) = Index);
      Inc(Position, Incrementor);
   end;
   Assert(Index = $10FFFF);
   Assert(Position = Length(S));
end;
{$ENDIF}

initialization
   {$IFDEF TESTS}
      TestUTF8Decoder();
      TestUTF8Encoder();
      TestUTF8();
   {$ENDIF}
end.