Skip to content

Commit

Permalink
Support working-tree-encoding "UTF-16LE-BOM"
Browse files Browse the repository at this point in the history
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16

The Unicode standard itself defines 3 allowed ways to encode UTF-16.
The following 3 versions all convert back to 'g' 'i' 't' in UTF-8:

a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000    g   i   t

b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000    g   i   t

c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000    g   i   t

Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.

iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:

d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000  376 377  \0   g  \0   i  \0   t

e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000    g  \0   i  \0   t  \0

f)  UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000   \0   g  \0   i  \0   t

There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully Unicode-aware applications should be able to read all 3 variants,
but in practice we are not there yet.)

When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).

iconv can produce UTF-16 files with little endianness by using "UTF-16LE"
as encoding, and that file does not have a BOM.

Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).

Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).

Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)

Reported-by: Adrián Gimeno Balaguer <[email protected]>
Signed-off-by: Torsten Bögershausen <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
  • Loading branch information
tboegi authored and gitster committed Jan 31, 2019
1 parent 0d0ac38 commit aab2a1a
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 14 deletions.
4 changes: 3 additions & 1 deletion Documentation/gitattributes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,9 @@ automatic line ending conversion based on your platform.

Use the following attributes if your '*.ps1' files are UTF-16 little
endian encoded without BOM and you want Git to use Windows line endings
in the working directory. Please note, it is highly recommended to
in the working directory (use `UTF-16LE-BOM` instead of `UTF-16LE` if
you want UTF-16 little endian with BOM).
Please note, it is highly recommended to
explicitly define the line endings with `eol` if the `working-tree-encoding`
attribute is used to avoid ambiguity.

Expand Down
2 changes: 1 addition & 1 deletion compat/precompose_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ void precompose_argv(int argc, const char **argv)
size_t namelen;
oldarg = argv[i];
if (has_non_ascii(oldarg, (size_t)-1, &namelen)) {
newarg = reencode_string_iconv(oldarg, namelen, ic_precompose, NULL);
newarg = reencode_string_iconv(oldarg, namelen, ic_precompose, 0, NULL);
if (newarg)
argv[i] = newarg;
}
Expand Down
12 changes: 11 additions & 1 deletion t/t0028-working-tree-encoding.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ test_expect_success 'setup test files' '
text="hallo there!\ncan you read me?" &&
echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
printf "$text" >test.utf8.raw &&
printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
printf "\377\376" >test.utf16lebom.raw &&
printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
# Line ending tests
printf "one\ntwo\nthree\n" >lf.utf8.raw &&
Expand All @@ -32,7 +35,8 @@ test_expect_success 'setup test files' '
# Add only UTF-16 file, we will add the UTF-32 file later
cp test.utf16.raw test.utf16 &&
cp test.utf32.raw test.utf32 &&
git add .gitattributes test.utf16 &&
cp test.utf16lebom.raw test.utf16lebom &&
git add .gitattributes test.utf16 test.utf16lebom &&
git commit -m initial
'

Expand All @@ -51,6 +55,12 @@ test_expect_success 're-encode to UTF-16 on checkout' '
test_cmp_bin test.utf16.raw test.utf16
'

test_expect_success 're-encode to UTF-16-LE-BOM on checkout' '
rm test.utf16lebom &&
git checkout test.utf16lebom &&
test_cmp_bin test.utf16lebom.raw test.utf16lebom
'

test_expect_success 'check $GIT_DIR/info/attributes support' '
test_when_finished "rm -f test.utf32.git" &&
test_when_finished "git reset --hard HEAD" &&
Expand Down
42 changes: 32 additions & 10 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */

static const char utf16_be_bom[] = {'\xFE', '\xFF'};
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};

struct interval {
ucs_char_t first;
ucs_char_t last;
Expand Down Expand Up @@ -470,16 +475,17 @@ int utf8_fprintf(FILE *stream, const char *format, ...)
#else
typedef char * iconv_ibp;
#endif
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, size_t *outsz_p)
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv,
size_t bom_len, size_t *outsz_p)
{
size_t outsz, outalloc;
char *out, *outpos;
iconv_ibp cp;

outsz = insz;
outalloc = st_add(outsz, 1); /* for terminating NUL */
outalloc = st_add(outsz, 1 + bom_len); /* for terminating NUL */
out = xmalloc(outalloc);
outpos = out;
outpos = out + bom_len;
cp = (iconv_ibp)in;

while (1) {
Expand Down Expand Up @@ -540,10 +546,30 @@ char *reencode_string_len(const char *in, size_t insz,
{
iconv_t conv;
char *out;
const char *bom_str = NULL;
size_t bom_len = 0;

if (!in_encoding)
return NULL;

/* UTF-16LE-BOM is the same as UTF-16 for reading */
if (same_utf_encoding("UTF-16LE-BOM", in_encoding))
in_encoding = "UTF-16";

/*
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
* Some users under Windows want the little endian version
*/
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
bom_str = utf16_le_bom;
bom_len = sizeof(utf16_le_bom);
out_encoding = "UTF-16LE";
} else if (same_utf_encoding("UTF-16BE-BOM", out_encoding)) {
bom_str = utf16_be_bom;
bom_len = sizeof(utf16_be_bom);
out_encoding = "UTF-16BE";
}

conv = iconv_open(out_encoding, in_encoding);
if (conv == (iconv_t) -1) {
in_encoding = fallback_encoding(in_encoding);
Expand All @@ -553,9 +579,10 @@ char *reencode_string_len(const char *in, size_t insz,
if (conv == (iconv_t) -1)
return NULL;
}

out = reencode_string_iconv(in, insz, conv, outsz);
out = reencode_string_iconv(in, insz, conv, bom_len, outsz);
iconv_close(conv);
if (out && bom_str && bom_len)
memcpy(out, bom_str, bom_len);
return out;
}
#endif
Expand All @@ -566,11 +593,6 @@ static int has_bom_prefix(const char *data, size_t len,
return data && bom && (len >= bom_len) && !memcmp(data, bom, bom_len);
}

static const char utf16_be_bom[] = {'\xFE', '\xFF'};
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};

int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
{
return (
Expand Down
2 changes: 1 addition & 1 deletion utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void strbuf_utf8_replace(struct strbuf *sb, int pos, int width,

#ifndef NO_ICONV
char *reencode_string_iconv(const char *in, size_t insz,
iconv_t conv, size_t *outsz);
iconv_t conv, size_t bom_len, size_t *outsz);
char *reencode_string_len(const char *in, size_t insz,
const char *out_encoding,
const char *in_encoding,
Expand Down

0 comments on commit aab2a1a

Please sign in to comment.