Skip to content

Commit

Permalink
Fix charset conversion from utf16.
Browse files Browse the repository at this point in the history
  • Loading branch information
smimram committed Jun 1, 2023
1 parent a437aba commit ea40a33
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 36 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- Add support for FLAC.
- id3v2: use "bpm" instead of "tempo".
- id3v2: convert "tlen" to "duration".
- Fix charset conversion from utf16.

0.1.0 (2023-02-08)
=====
Expand Down
22 changes: 9 additions & 13 deletions src/metadataCharEncoding.ml
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,22 @@ end
module Naive : T = struct
let convert ?source s =
let source = match source with None -> `UTF_8 | Some x -> x in
let endianness = ref `BE in
let endianness = ref (if source = `UTF_16LE then `LE else `BE) in
let buf = Buffer.create 10 in
match source with
| (`UTF_16 | `UTF_16LE | `UTF_16BE) as source ->
let get_char =
match source with
| `UTF_16LE -> String.get_utf_16le_uchar
| `UTF_16BE -> String.get_utf_16be_uchar
| `UTF_16 -> (
match !endianness with
| `LE -> String.get_utf_16le_uchar
| `BE -> String.get_utf_16be_uchar)
in
| `UTF_16 | `UTF_16LE | `UTF_16BE ->
let len = String.length s in
let rec f pos =
let get_char =
match !endianness with
| `LE -> String.get_utf_16le_uchar
| `BE -> String.get_utf_16be_uchar
in
if pos = len then Buffer.contents buf
else if pos + 2 <= len && s.[pos] = '\xfe' && s.[pos] = '\xff' then (
else if pos + 2 <= len && s.[pos] = '\xfe' && s.[pos+1] = '\xff' then (
endianness := `BE;
f (pos + 2))
else if pos + 2 <= len && s.[pos] = '\xff' && s.[pos] = '\xfe' then (
else if pos + 2 <= len && s.[pos] = '\xff' && s.[pos+1] = '\xfe' then (
endianness := `LE;
f (pos + 2))
else (
Expand Down
37 changes: 15 additions & 22 deletions src/metadataID3v2.ml
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,20 @@ let next_substring encoding ?(offset = 0) s =
let ans = ref 0 in
let utf16 = encoding = 1 || encoding = 2 in
try
for i = offset to String.length s - if utf16 then 2 else 1 do
if utf16 then (
if s.[i] = '\000' && s.[i + 1] = '\000' then (
ans := i + 2;
raise Exit))
else if s.[i] = '\000' then (
ans := i + 1;
raise Exit)
done;
if utf16 then
for i = offset to String.length s / 2 - 1 do
if s.[2*i] = '\000' && s.[2*i+1] = '\000' then (
ans := 2*i+2;
raise Exit
)
done
else
for i = offset to String.length s - 1 do
if s.[i] = '\000' then (
ans := i + 1;
raise Exit
)
done;
raise Not_found
with Exit -> !ans

Expand Down Expand Up @@ -78,19 +83,7 @@ let make_recode recode =
in
let recode : int -> string -> string = function
| 0 -> recode ~source:`ISO_8859_1
| 1 -> (
fun s ->
match String.length s with
(* Probably invalid string *)
| n when n < 2 -> s
| n -> (
match String.sub s 0 2 with
| "\255\254" | "\255\246" ->
recode ~source:`UTF_16LE (String.sub s 2 (n - 2))
| "\254\255" | "\246\255" ->
recode ~source:`UTF_16BE (String.sub s 2 (n - 2))
(* Probably invalid string *)
| _ -> recode ~source:`UTF_16 s))
| 1 -> recode ~source:`UTF_16
| 2 -> recode ~source:`UTF_16
| 3 -> recode ~source:`UTF_8
(* Invalid encoding. *)
Expand Down
2 changes: 2 additions & 0 deletions test/test
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
# Run the test executable through dune, forwarding every command-line
# argument given to this script on to test.exe.
#
# "$@" must be quoted: an unquoted $@ is subject to word splitting and
# pathname expansion, so an argument containing spaces or glob characters
# would be split or rewritten before test.exe sees it.
dune exec --no-print-directory ./test.exe -- "$@"
8 changes: 7 additions & 1 deletion test/test.ml
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
let () =
assert (Metadata.ID3v2.unterminate 2 "\000ab\000de\000\000" = "\000ab\000de")
assert (Metadata.ID3v2.unterminate 2 "\000ab\000de\000\000" = "\000ab\000de");
(* Little endian. *)
assert (Metadata.CharEncoding.Naive.convert ~source:`UTF_16LE "a\x00b\x00c\x00" = "abc");
assert (Metadata.CharEncoding.Naive.convert ~source:`UTF_16 "\xff\xfea\x00b\x00c\x00" = "abc");
(* Big endian. *)
assert (Metadata.CharEncoding.Naive.convert ~source:`UTF_16BE "\x00a\x00b\x00c" = "abc");
assert (Metadata.CharEncoding.Naive.convert ~source:`UTF_16 "\xfe\xff\x00a\x00b\x00c" = "abc")

0 comments on commit ea40a33

Please sign in to comment.