Skip to content

Commit

Permalink
Remove runtime dep on Uucp.
Browse files Browse the repository at this point in the history
Use internal Unicode property database instead.

- Con: An 84K larger Notty.
- Pro: 10M of deps less.
  • Loading branch information
pqwy committed Sep 1, 2022
1 parent 3a09179 commit f500a80
Show file tree
Hide file tree
Showing 12 changed files with 724 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ tmp
\.\#*
\#*#

*.json
gmon.out
rondom
3 changes: 0 additions & 3 deletions notty.opam
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@ build: [ [ "dune" "subst" ] {pinned}
depends: [
"ocaml" {>= "4.05.0"}
"dune" {build & >= "1.7"}
"uchar"
"uucp" {>= "2.0.0"}
"uuseg" {>= "1.0.0"}
"uutf" {>= "1.0.0"}
]
depopts: [ "lwt" ]
Expand Down
7 changes: 5 additions & 2 deletions src/dune
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
(include_subdirs unqualified)

(library
(public_name notty)
(synopsis "Declaring terminals")
(libraries uchar uuseg uutf)
(libraries uchar uutf)
(wrapped false)
(modules notty))
(modules notty notty_grapheme_cluster notty_uucp notty_uucp_data)
(private_modules notty_grapheme_cluster notty_uucp notty_uucp_data))

(library
(public_name notty.top)
Expand Down
13 changes: 13 additions & 0 deletions src/no-uucp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Cannibalized bits of Uucp:

- `Notty_uucp_data` is generated from an actual Uucp installation.
- `Notty_uucp` uses it to provide the few Unicode properties that Notty needs.
- `Notty_grapheme_cluster` is `Grapheme_cluster` from Uuseg, adapted to use the
above.

Compiled size of these is on the order of 70K. Uucp is presently a monolithic 10M.

The idea is to remove these in favor of the actual Uucp/Uuseg, as soon as it
becomes possible to depend only on the necessary parts of Uucp.

Uucp and Uuseg are Copyright (c) 2014 Daniel C. Bünzli.
133 changes: 133 additions & 0 deletions src/no-uucp/notty_grapheme_cluster.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
(*---------------------------------------------------------------------------
Copyright (c) 2014 Daniel C. Bünzli. All rights reserved.
Distributed under the ISC license, see terms at the end of the file.
%%NAME%% %%VERSION%%
---------------------------------------------------------------------------*)

(* These are the rules as found in [1], with property values aliases [2]
substituted.
GB1. sot ÷ Any
GB2. Any ÷ eot
GB3. CR × LF
GB4. (CN|CR|LF) ÷
GB5. ÷ (CN|CR|LF)
GB6. L × (L|V|LV|LVT)
GB7. (LV|V) × (V|T)
GB8. (LVT|T) × T
GB9. × (EX|ZWJ)
GB9a. × SM
GB9b. PP ×
GB10. (v10.0.0) (EB|EBG) EX* × EM
GB11. (v10.0.0) ZWJ × (GAZ|EBG)
GB12. sot (RI RI)* RI × RI
GB13. [^RI] (RI RI)* × RI
GB999. Any ÷ Any
[1]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
[2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt
[3]: http://www.unicode.org/Public/7.0.0/ucd/auxiliary/GraphemeBreakTest.html
By the structure of the rules we see that grapheme cluster
boundaries can *mostly* be determined by simply looking at the
grapheme cluster break property value of the character on the left
and on the right of a boundary. The exceptions are GB10 and GB12-13,
which are handled specially by enriching the segmenter state in
a horribly ad-hoc fashion. *)

(* What [add] hands back to the client: `Await (nothing to report, feed more
   input), `Boundary (a cluster boundary sits at this point), `Uchar (a
   character passed through), or `End (the stream is finished). *)
type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ]

(* Grapheme cluster break property values, named by the aliases used in the
   rules above. [Sot] is not a property value: it marks the start of text and
   is what a fresh segmenter has on its left. *)
type gcb =
| CN | CR | EX | EB | EBG | EM | GAZ | L | LF | LV | LVT | PP | RI
| SM | T | V | XX | ZWJ | Sot

(* WARNING. The indexes used here need to be synchronized with those
assigned by uucp for Uucp.Break.Low.grapheme_cluster, which this library
reads through Notty_uucp.grapheme_cluster_boundary. The position of a
constructor in the array is the integer that lookup returns for it
(e.g. XX sits at index 16). Sot is deliberately absent: it is never
the property of an actual character. *)

let byte_to_gcb =
[| CN; CR; EX; EB; EBG; EM; GAZ; L; LF; LV; LVT; PP; RI;
SM; T; V; XX; ZWJ; |]

(* [gcb u] is the grapheme cluster break property of [u]: the integer index
   from the property table, decoded through [byte_to_gcb]. *)
let gcb u =
  Notty_uucp.grapheme_cluster_boundary u |> Array.get byte_to_gcb

(* Phase of the segmenter; it determines which inputs [add] accepts next. *)
type state =
| Fill (* get next uchar to decide boundary. *)
| Flush (* a uchar is buffered, client needs to get it out with `Await. *)
| End (* `End was added. *)

(* Mutable segmenter: [create] builds one, [add] advances it. Besides the
   property on the left of the candidate boundary it carries the extra
   context needed by rules GB10 ([emoji_seq]) and GB12-13 ([odd_ri]). *)
type t =
{ mutable state : state; (* current state. *)
mutable left : gcb; (* break property value left of boundary; Sot before any input. *)
mutable odd_ri : bool; (* odd number of RI on the left. *)
mutable emoji_seq : bool; (* (EB|EBG) Extend* on the left. *)
mutable buf : [ `Uchar of Uchar.t ] } (* uchar held back while `Boundary is reported. *)

(* Placeholder content for the one-slot buffer: U+0000. It is always
   overwritten before the buffer is read. *)
let nul_buf = `Uchar Uchar.min

(* [create ()] is a fresh segmenter: waiting for input, at the start of text,
   with no regional-indicator or emoji context recorded. *)
let create () =
  let placeholder = nul_buf (* overwritten before it is ever read *) in
  { buf = placeholder;
    emoji_seq = false;
    odd_ri = false;
    left = Sot;
    state = Fill }

(* [break seg rhs] is [true] iff there is a grapheme cluster boundary between
   what [seg] has seen so far and a character whose break property is [rhs].
   Rows are tried top to bottom, in the order of the rules they implement.
   GB10 and GB12-13 are the only rows that consult the extra context kept
   in [seg]; when their flag is unset they end up at GB999, hence
   [not flag]. *)
let break seg rhs =
  match seg.left, rhs with
  | Sot, _ -> true                           (* GB1 *)
  (* GB2 is handled by `End *)
  | CR, LF -> false                          (* GB3 *)
  | (CN|CR|LF), _ | _, (CN|CR|LF) -> true    (* GB4, GB5 *)
  | L, (L|V|LV|LVT)                          (* GB6 *)
  | (LV|V), (V|T)                            (* GB7 *)
  | (LVT|T), T                               (* GB8 *)
  | _, (EX|ZWJ|SM)                           (* GB9, GB9a *)
  | PP, _ -> false                           (* GB9b *)
  | _, EM -> not seg.emoji_seq               (* GB10 *)
  | ZWJ, (GAZ|EBG) -> false                  (* GB11 *)
  | RI, RI -> not seg.odd_ri                 (* GB12, GB13 *)
  | _, _ -> true                             (* GB999 *)

(* [update_left seg rhs] makes [rhs] the new left-hand property of [seg] and
   refreshes the context flags:
   - EX leaves [emoji_seq] untouched, so that (EB|EBG) EX* keeps counting as
     an emoji sequence;
   - EB and EBG start an emoji sequence;
   - RI flips the parity of the current run of regional indicators;
   - anything else clears both flags. *)
let update_left seg rhs =
  let emoji_seq, odd_ri =
    match rhs with
    | EX -> seg.emoji_seq, false
    | EB | EBG -> true, false
    | RI -> false, not seg.odd_ri
    | CN | CR | EM | GAZ | L | LF | LV | LVT | PP | SM | T | V | XX | ZWJ
    | Sot -> false, false
  in
  seg.left <- rhs;
  seg.emoji_seq <- emoji_seq;
  seg.odd_ri <- odd_ri

(* [add seg input] feeds [input] to the segmenter and returns its answer.

   While filling:
   - `Uchar u is returned as is when no boundary precedes it; otherwise
     `Boundary is returned and [u] is held back until the client asks for it
     with `Await;
   - `Await is answered with `Await (nothing is pending);
   - `End finishes the stream: `Boundary closes the last cluster, or `End is
     returned directly when no character was ever added.

   While a character is held back, `Await releases it. Once `End has been
   added, `Await is answered with `End.

   Adding `Uchar or `End while a character is held back, or after `End, is a
   client error and fails an assertion. *)
let add seg input =
  match seg.state, input with
  | Fill, (`Uchar u as uchar) ->
      let rhs = gcb u in
      let boundary = break seg rhs in
      update_left seg rhs;
      if boundary then begin
        seg.state <- Flush;
        seg.buf <- uchar;
        `Boundary
      end
      else uchar
  | Fill, `Await -> `Await
  | Fill, `End ->
      seg.state <- End;
      if seg.left = Sot then `End else `Boundary
  | Flush, `Await ->
      seg.state <- Fill;
      (seg.buf :> ret)
  | End, `Await -> `End
  | (Flush | End), (`Uchar _ | `End) -> assert false

(*---------------------------------------------------------------------------
Copyright (c) 2014 Daniel C. Bünzli
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
---------------------------------------------------------------------------*)
27 changes: 27 additions & 0 deletions src/no-uucp/notty_grapheme_cluster.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
(*---------------------------------------------------------------------------
Copyright (c) 2014 Daniel C. Bünzli. All rights reserved.
Distributed under the ISC license, see terms at the end of the file.
%%NAME%% %%VERSION%%
---------------------------------------------------------------------------*)

(** Values returned by {!add}. *)
type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ]

(** Grapheme cluster segmenter. *)
type t

(** [create ()] is a fresh segmenter, positioned at the start of text. *)
val create : unit -> t

(** [add s v] feeds [v] to [s]:
    - [`Uchar u] is returned unchanged when no boundary precedes [u];
      otherwise [`Boundary] is returned and [u] is held back, to be
      retrieved with [`Await];
    - [`Await] returns the held-back [`Uchar], if there is one; [`End] once
      [`End] has been added; and [`Await] otherwise;
    - [`End] returns [`Boundary], or [`End] if no character was ever added.

    Adding [`Uchar] or [`End] while a character is held back, or after
    [`End] has been added, fails an assertion. *)
val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> ret

(*---------------------------------------------------------------------------
Copyright (c) 2014 Daniel C. Bünzli
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
---------------------------------------------------------------------------*)
48 changes: 48 additions & 0 deletions src/no-uucp/notty_uucp.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
(* Copyright (c) 2020 David Kaloper Meršinjak. All rights reserved.
See LICENSE.md. *)

(* Unpacked interval lookup table: three parallel arrays holding, for each
   interval, its lower bound, its upper bound (both inclusive) and its value.
   [find_i ~def k tab] binary-searches for the interval containing [k] and
   returns its value, or [def] when no interval contains [k]. The bounds are
   read without bounds checks, so the three arrays are expected to have the
   same length. *)
let find_i ~def (k : int) (los, his, vs) =
  let rec search lo hi =
    if lo > hi then def
    else
      let mid = (lo + hi) / 2 in
      if k < Array.unsafe_get los mid then search lo (mid - 1)
      else if k > Array.unsafe_get his mid then search (mid + 1) hi
      else Array.unsafe_get vs mid
  in
  search 0 (Array.length los - 1)

(* Fixed-depth 8-8-8-bit trie; root is variable, levels 2,3 are either empty
   or full. [find_t ~def k tab] is the byte stored for [k], or [def] when the
   path to it runs through a missing root slot or an empty level.
   Keys above 0xd7ff are shifted down by 0x800 before the lookup, which
   closes the gap left by the surrogate range. *)
let find_t ~def k tab =
  let k = if k > 0xd7ff then k - 0x800 else k in
  let hi = (k lsr 16) land 0xff
  and mid = (k lsr 8) land 0xff
  and lo = k land 0xff in
  if hi >= Array.length tab then def
  else
    let plane = Array.unsafe_get tab hi in
    if Array.length plane = 0 then def
    else
      let page = Array.unsafe_get plane mid in
      if String.length page = 0 then def
      else Char.code (String.unsafe_get page lo)

(* Column width hint for [u].
   The w = -1 cases (C0 controls other than U+0000, and U+007F..U+009F) are
   caught here, and w = 1 is the default, to minimize the table: it is only
   consulted above U+02FF, and whatever it does not list has width 1. *)
let tty_width_hint u =
  let cp = Uchar.to_int u in
  if cp = 0 then 0
  else if cp <= 0x001f || (0x007f <= cp && cp <= 0x009f) then -1
  else if cp <= 0x02ff then 1
  else find_i ~def:1 cp Notty_uucp_data.tty_width_hint

(* Integer index of the grapheme cluster break property of [u]. Code points
   missing from the trie get 16, the index at which
   Notty_grapheme_cluster decodes XX. *)
let grapheme_cluster_boundary u =
  let xx = 16 in
  Notty_uucp_data.grapheme_cluster_boundary
  |> find_t ~def:xx (Uchar.to_int u)

(* let check () = *)
(* let pp_u ppf u = Format.fprintf ppf "u+%04x" (Uchar.to_int u) in *)
(* let rec go i u = *)
(* let w1 = tty_width_hint u *)
(* and w2 = Uucp.Break.tty_width_hint u in *)
(* if w1 <> w2 then Format.printf "w: %a here: %d there: %d@." pp_u u w1 w2; *)
(* let gc1 = grapheme_cluster_boundary u *)
(* and gc2 = Uucp.Break.Low.grapheme_cluster u in *)
(* if gc1 <> gc2 then Format.printf "gc: %a here: %d there: %d@." pp_u u gc1 gc2; *)
(* if u = Uchar.max then i else go (i + 1) (Uchar.succ u) in *)
(* let n = go 1 Uchar.min in *)
(* Format.printf "Checked equality for %d code points.@." n *)

13 changes: 13 additions & 0 deletions src/no-uucp/notty_uucp.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
(* Copyright (c) 2020 David Kaloper Meršinjak. All rights reserved.
See LICENSE.md. *)

(* This is a local copy of the (very few) relevant [uucp] properties. *)

val tty_width_hint : Uchar.t -> int
(* [Uucp.Break.tty_width_hint].
   This implementation returns 0 for U+0000, -1 for the other code points up
   to U+001F and for U+007F..U+009F, 1 for the rest up to U+02FF, and above
   that the value from the bundled table, or 1 if the table has no entry. *)

val grapheme_cluster_boundary : Uchar.t -> int
(* [Uucp.Break.Low.grapheme_cluster].
   The result is an integer index; code points without an entry in the
   bundled table map to 16. *)

(* val check : unit -> unit *)

Loading

0 comments on commit f500a80

Please sign in to comment.