Skip to content

Commit

Permalink
Publish CharacterIODemo
Browse files Browse the repository at this point in the history
  • Loading branch information
wasadigi committed Mar 4, 2019
1 parent 83546e5 commit a800f33
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 0 deletions.
39 changes: 39 additions & 0 deletions examples/03-CharacterIODemo/CharacterIODemo/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the CharacterIODemo example.
  It packages the demo as a jar and runs the shade plugin during the package
  phase so that the jar manifest names the demo's main class.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Maven coordinates of this example project. -->
<groupId>ch.heigvd.res</groupId>
<artifactId>CharacterIODemo</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<executions>
<execution>
<!-- Run the shade goal as part of "mvn package". -->
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<!-- Records the entry point in the jar manifest so the jar can be started with "java -jar". -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>ch.heigvd.res.io.CharacterIODemo</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

<properties>
<!-- The Java sources are read as UTF-8; the demo source contains non-ASCII characters. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Compile for Java 7 (source and bytecode level). -->
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>
<name>CharacterIODemo</name>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package ch.heigvd.res.io;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This program shows how to use character encodings in Java. It shows that while
 * Java uses Unicode once characters or Strings are created in memory, a translation
 * needs to happen when bytes are converted into characters, and the other way around.
 *
 * The program also highlights typical problems that arise if the developer does not
 * control character encodings. These problems manifest themselves as '?' or other
 * strange characters appearing in text messages.
 *
 * It is a good idea to run this program in debug mode, to be able to explore the
 * memory representations of the manipulated strings.
 *
 * @author Olivier Liechti
 */
public class CharacterIODemo {

  /**
   * Encodes the given Unicode string into bytes using the specified encoding,
   * dumps the resulting bytes on the console, and then decodes them back into a
   * String twice: first with the JVM default charset (platform specific), then
   * with the same encoding that was used for encoding. Different encodings
   * produce different byte arrays, possibly of different sizes.
   *
   * @param message the test String we want to encode
   * @param encoding the name of the encoding used for encoding/decoding message
   * @throws java.io.IOException if the named encoding is not supported or if
   *         writing to the in-memory stream fails
   */
  public void encodeAndDecode(String message, String encoding) throws IOException {
    // An output stream that collects the written bytes in memory.
    final ByteArrayOutputStream os = new ByteArrayOutputStream();

    // The OutputStreamWriter converts the characters written to it into bytes,
    // using the encoding passed as parameter. The writer must be CLOSED (not
    // just flushed) before the bytes are read: only close() makes the encoder
    // emit its final state, e.g. the replacement for a trailing unpaired
    // surrogate or the closing escape sequence of a stateful encoding.
    try (OutputStreamWriter writer = new OutputStreamWriter(os, encoding)) {
      writer.write(message);
    }

    // The writer has translated the characters into bytes and forwarded them
    // to the wrapped ByteArrayOutputStream. We can now fetch these bytes.
    final byte[] encodedMessage = os.toByteArray();

    System.out.println("When I encode '" + message + "' (" + message.length() + " chars) with encoding " + encoding + ", I generate " + encodedMessage.length + " bytes.");
    dumpByteArray(encodedMessage);

    // Now, let's do the work the other way around and convert the byte array
    // back to a Java String. Here we deliberately decode with the JVM default
    // charset (which, depending on the Java version, can be influenced at
    // startup with -Dfile.encoding=XXX). We expect this to break in many cases:
    // if the test string was encoded with UTF-16 and is decoded with UTF-8,
    // the result is corrupted. The charset is passed explicitly so that the
    // name reported below is exactly the one that was used for decoding.
    final Charset defaultCharset = Charset.defaultCharset();
    String decodedMessage = new String(encodedMessage, defaultCharset);
    System.out.print("If I decode the result with the default encoding for this JVM (" + defaultCharset.name() + "), I get: ");
    System.out.println(decodedMessage);

    // Now, let's use the same encoding for converting the byte array back into
    // a String. That works better in some cases, BUT the message may still not
    // be displayed correctly, because encoding the test string into bytes MAY
    // lose information. For instance, if the test string contains Japanese
    // characters and is encoded with ASCII, the encoder cannot represent them
    // and replaces them with '?' characters. Once this is done, the original
    // string cannot be recovered from the byte array.
    decodedMessage = new String(encodedMessage, encoding);
    System.out.print("If I decode the result with the same encoding (" + encoding + ") for this JVM (" + defaultCharset.name() + "), I get: ");
    System.out.println(decodedMessage);
    System.out.println();
  }

  /**
   * A utility method to dump a byte array onto the console, one byte per line,
   * showing its 8-digit binary value followed by its signed decimal value.
   *
   * @param array the byte array we want to show
   */
  private static void dumpByteArray(byte[] array) {
    final StringBuilder dump = new StringBuilder();
    for (byte value : array) {
      // Mask with 0xFF so that a negative byte is rendered as 8 bits instead of
      // a sign-extended 32-bit pattern; then left-pad with zeros to 8 digits.
      final String binary = String.format("%8s", Integer.toBinaryString(value & 0xFF)).replace(' ', '0');
      // Signed decimal value, right-aligned in a 5-character column.
      final String decimal = String.format("%5s", value);
      dump.append(binary).append(' ').append(decimal).append('\n');
    }
    System.out.print(dump.toString());
  }

  /**
   * Runs the encode/decode round trip on one test message with several encodings.
   *
   * @param args the command line arguments (not used)
   */
  public static void main(String[] args) {
    final CharacterIODemo demo = new CharacterIODemo();

    // The test message mixes 1) plain latin characters, 2) characters with
    // accents (e-acute, e-grave) and 3) two japanese characters. It is written
    // with Unicode escapes so that its value does not depend on the encoding
    // used to compile this source file.
    final String message = "ABC \u00e9l\u00e8ve \u5e83\u5cf6";

    final String[] encodings = {
      "US-ASCII",    // no support for accents, nor japanese
      "ISO-8859-15", // latin western european: support for accents, not japanese
      "UTF-8",
      "UTF-16"
    };

    try {
      for (String encoding : encodings) {
        demo.encodeAndDecode(message, encoding);
      }
    } catch (IOException ex) {
      Logger.getLogger(CharacterIODemo.class.getName()).log(Level.SEVERE, "Character encoding demo failed", ex);
    }
  }

}

0 comments on commit a800f33

Please sign in to comment.