Javaにおける文字コードまわりの話(3)

「Javaにおける文字コードまわりの話(2) - あしのあしあと」の続き。文字の一覧を取得する方法についてメモしておく。

文字数が少ないものや、特定の文字だけを取得したい場合には、次のように文字リテラルを用いれば十分である。

/**
 * Builds a small, fixed set of characters from character literals.
 *
 * @return a new {@link HashSet} containing exactly 'X', 'Y' and 'Z'
 */
public static Set<Character> getXyzChars() {
    Set<Character> result = new HashSet<Character>();
    for (char letter : new char[] {'X', 'Y', 'Z'}) {
        result.add(Character.valueOf(letter));
    }
    return result;
}

ある範囲内の文字の一覧を取得する場合には、次の2つの方法のうちいずれかを用いる（のではないか）。

Unicodeのコードポイントを指定する
バイト配列をデコードする

それぞれについて、簡単なプログラムを作成してみた。

1. Unicodeのコードポイントを指定する

よく用いられる半角文字（ここでは、ラテン文字用図形文字と仮名文字用図形文字）を取得してみる。
といっても、次のgetCharsInRangeメソッドにより、文字（実体は整数）をインクリメントしていっているだけ。

/**
 * Requests the characters U+0021 (!) through U+007E (~) from {@code getCharsInRange}.
 * U+0020 (SPACE) lies just below the range and is therefore not included.
 * @return half-width Latin graphic characters
 */
public static Set<Character> getLatinGraphicChars() {
    final char first = '\u0021';
    final char last = '\u007e';
    return getCharsInRange(first, last);
}
/**
 * Requests the characters U+FF61 (｡) through U+FF9F (ﾟ) from {@code getCharsInRange}.
 * @return half-width kana graphic characters
 */
public static Set<Character> getKanaGraphicChars() {
    final char first = '\uff61';
    final char last = '\uff9f';
    return getCharsInRange(first, last);
}

/**
 * Collects every character from {@code from} to {@code to}, both ends inclusive.
 *
 * @param from first character of the range
 * @param to last character of the range; must not be smaller than {@code from}
 * @return the characters in the range, held in a {@link TreeSet} that is ordered by
 *         {@code CHAR_VALUE_COMPARATOR}
 */
private static Set<Character> getCharsInRange(char from, char to) {
    // Precondition: the range must not be inverted. The assertion states the condition
    // that has to hold; the message describes the violation.
    assert from <= to : String.format("from(%s)がto(%s)より大きい値です。", from, to);

    Set<Character> chars = new TreeSet<Character>(CHAR_VALUE_COMPARATOR);
    // Count with an int: a char counter wraps from U+FFFF back to U+0000, so the loop
    // condition could never become false when "to" is U+FFFF.
    for (int code = from; code <= to; code++) {
        chars.add(Character.valueOf((char) code));
    }
    return chars;
}

なお、ISO/IEC 646（ASCIIを国際化した標準）においては、各国版で文字が異なっている箇所がある。例えば、ASCIIとJIS X 0201において、0x5Cと0x7Eには異なる文字が割り当てられている。

バイト	ASCII	JIS X 0201
0x5C	バックスラッシュ（\）	円記号（¥）
0x7E	チルダ（~）	オーバーライン（‾）

ちなみに、上記のgetLatinGraphicCharsメソッドでは、U+005C（Unicode上はバックスラッシュ）およびU+007E（Unicode上はチルダ）が取得されるが、エンコードされた0x5Cと0x7Eは、日本のWindowsでは、円記号、チルダ（オーバーラインではない）の扱いである。
結局、ユーザさんが用いる端末で、エンコードされた文字がどのように表示されるか、画面から入力できるかなどを確認しなければならないのだが、これは別の機会に。

2. バイト配列をデコードする

Windows-31Jにおけるベンダ拡張文字（13区のNEC 特殊文字、89〜92区のNEC選定IBM拡張文字、115〜119区のIBM拡張文字）を取得してみる。
次のgetCharsDecodedInMs932メソッドにおいて、Windows-31Jを用いるインプットストリームリーダにより、バイト配列をデコードしている。
前回紹介したCharsetDecoderクラスを用いた、decodeBytesメソッドを用いてもよい。

/**
 * Decodes the bytes supplied by {@code getNecSpecialBytes()} with
 * {@code getCharsDecodedInMs932}.
 * @return NEC special characters
 */
public static Set<Character> getNecSpecialChars() {
    byte[] encoded = getNecSpecialBytes();
    return getCharsDecodedInMs932(encoded);
}
/**
 * @return the two-byte values 0x8740 through 0x879e (row 13), laid out as one byte array
 */
public static byte[] getNecSpecialBytes() {
    byte[] row13 = getByteArrayInRange(0x8740, 0x879e);
    return appendBytes(row13);
}
/**
 * Decodes the bytes supplied by {@code getNecSelectionOfIbmExtensionsBytes()} with
 * {@code getCharsDecodedInMs932}.
 * @return NEC selection of IBM extension characters
 */
public static Set<Character> getNecSelectionOfIbmExtensions() {
    byte[] encoded = getNecSelectionOfIbmExtensionsBytes();
    return getCharsDecodedInMs932(encoded);
}
/**
 * @return the two-byte values of rows 89 through 92, concatenated in row order
 */
public static byte[] getNecSelectionOfIbmExtensionsBytes() {
    byte[] row89 = getByteArrayInRange(0xed40, 0xed9e);
    byte[] row90 = getByteArrayInRange(0xed9f, 0xedfc);
    byte[] row91 = getByteArrayInRange(0xee40, 0xee9e);
    byte[] row92 = getByteArrayInRange(0xee9f, 0xeefc);
    return appendBytes(row89, row90, row91, row92);
}
/**
 * Decodes the bytes supplied by {@code getIbmExtensionsBytes()} with
 * {@code getCharsDecodedInMs932}.
 * @return IBM extension characters
 */
public static Set<Character> getIbmExtensions() {
    byte[] encoded = getIbmExtensionsBytes();
    return getCharsDecodedInMs932(encoded);
}
/**
 * @return the two-byte values of rows 115 through 119, concatenated in row order
 */
public static byte[] getIbmExtensionsBytes() {
    byte[] row115 = getByteArrayInRange(0xfa40, 0xfa9e);
    byte[] row116 = getByteArrayInRange(0xfa9f, 0xfafc);
    byte[] row117 = getByteArrayInRange(0xfb40, 0xfb9e);
    byte[] row118 = getByteArrayInRange(0xfb9f, 0xfbfc);
    byte[] row119 = getByteArrayInRange(0xfc40, 0xfc4b);
    return appendBytes(row115, row116, row117, row118, row119);
}

/** Comparator that orders characters by the bytes they are encoded to in Windows-31J. */
private static final Comparator<Character> CHAR_MS932_BYTES_COMPARATOR =
    new Comparator<Character>() {
    @Override
    public int compare(Character ch1, Character ch2) {
        // XXX Is there a more efficient way? Both characters are re-encoded and turned
        // into hex strings on every comparison; the strings are then compared as text.
        String leftHex = encodeToHex(ch1);
        String rightHex = encodeToHex(ch2);
        return leftHex.compareTo(rightHex);
    }

    /** Encodes one character in Windows-31J and renders the resulting bytes as hex. */
    private String encodeToHex(Character ch) {
        byte[] encoded;
        try {
            encoded = ch.toString().getBytes("windows-31j");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        return bytesToHexString(encoded);
    }
};

/**
 * Decodes the given bytes as Windows-31J and collects the resulting characters.
 *
 * @param bytes byte array to decode
 * @return the decoded characters, held in a {@link TreeSet} ordered by
 *         {@code CHAR_MS932_BYTES_COMPARATOR}; U+FFFD is never added
 */
private static Set<Character> getCharsDecodedInMs932(byte[] bytes) {
    Set<Character> decoded = new TreeSet<Character>(CHAR_MS932_BYTES_COMPARATOR);
    Reader in = null;
    try {
        in = new InputStreamReader(new ByteArrayInputStream(bytes), "windows-31j");

        char[] buffer = new char[2048];
        int count;
        while ((count = in.read(buffer, 0, buffer.length)) != -1) {
            for (int pos = 0; pos < count; pos++) {
                char current = buffer[pos];
                // Input that could not be decoded is mapped to U+FFFD (REPLACEMENT CHARACTER).
                // TODO: add handling for unreadable characters here; for now they are skipped.
                if (current != '\ufffd') {
                    decoded.add(Character.valueOf(current));
                }
            }
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
    return decoded;
}

この処理で用いている（ちょっとダサい）メソッドを、次に示す。

/**
 * Concatenates several byte arrays into one, in argument order.
 * For example, the arrays <code>{{0x00, 0x11}, {0x22, 0x33}, {0x44}}</code> yield
 * <code>{0x00, 0x11, 0x22, 0x33, 0x44}</code>.
 * @param bytesArray the byte arrays to join
 * @return a new byte array holding the contents of all given arrays
 */
private static byte[] appendBytes(byte[]... bytesArray) {
    // First pass: size of the result.
    int total = 0;
    for (byte[] part : bytesArray) {
        total += part.length;
    }

    // Second pass: copy each part behind the previous one.
    byte[] joined = new byte[total];
    int offset = 0;
    for (byte[] part : bytesArray) {
        System.arraycopy(part, 0, joined, offset, part.length);
        offset += part.length;
    }
    return joined;
}

/**
 * Lays out every integer from {@code from} to {@code to} (inclusive) as two bytes,
 * high byte first.
 * For example, {@code from = 17 (0x11)} and {@code to = 19 (0x13)} yield
 * <code>{0x00, 0x11, 0x00, 0x12, 0x00, 0x13}</code>.
 * Only the low two bytes of each value are written; with assertions enabled, values
 * above 0xffff are rejected instead.
 * @param from first integer of the range (at most 0xffff)
 * @param to last integer of the range (at most 0xffff, and not smaller than {@code from})
 * @return the integers in the range, each written as two bytes
 */
private static byte[] getByteArrayInRange(int from, int to) {
    // Each assertion states the condition that must hold; the message describes the
    // violation. Negated checks would fail for every valid argument under -ea.
    assert from <= 0xffff : String.format("from(%s)が2バイトより大きい値です。", from);
    assert to   <= 0xffff : String.format("to(%s)が2バイトより大きい値です。", to);
    assert from <= to     : String.format("from(%s)がto(%s)より大きい値です。", from, to);

    byte[] bytes = new byte[(to - from + 1) * 2];

    int b = from;
    for (int i = 0; i < bytes.length; b++) {
        bytes[i++] = (byte) ((b >> 8) & 0xff); // high byte
        bytes[i++] = (byte) ( b       & 0xff); // low byte
    }
    return bytes;
}

/** Lowercase hexadecimal digit characters, indexed by their value (0 to 15). */
private static final char[] HEX_DEGITS = "0123456789abcdef".toCharArray();

/**
 * Renders each byte as two lowercase hexadecimal digits.
 * For example, the bytes <code>{0x11, 0xaa}</code> yield the string {@code 11aa}.
 * @param bytes the bytes to render
 * @return the hexadecimal string for the given bytes
 */
public static String bytesToHexString(byte... bytes) {
    StringBuilder hex = new StringBuilder();
    for (int i = 0; i < bytes.length; i++) {
        int value = bytes[i] & 0xff;            // treat the byte as unsigned
        hex.append(HEX_DEGITS[value >>> 4]);    // high 4 bits
        hex.append(HEX_DEGITS[value & 0x0f]);   // low 4 bits
    }
    return hex.toString();
}

ちなみに、NEC 特殊文字、NEC選定IBM拡張文字、IBM拡張文字には、重複して定義されている文字がある。上記のメソッドで取得した文字のセットの中に、同じUnicodeのコードポイントにマッピングされている文字がある。その文字をエンコードすると……
今日はここまで。

2010/07/24追記

「Javaにおける文字コードまわりの話(5) - あしのあしあと」に、（整理し切れてはいないのだが）簡単なまとめを書いた。