搜狗拼音输入法SCEL词库文件解析(附java词库导出程序)

源程序下载:https://code.google.com/p/dict4cn/source/browse/trunk/importer/src/SogouScelReader.java

Source Code:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;

/**
 * Sogou Pinyin IME SCEL File Reader
 *
 * SCEL Format overview:
 *
 * General Information:
 * - Chinese characters and pinyin are all encoded with UTF-16LE.
 * - Numbers are using little endian byte order.
 *
 * SCEL hex analysis:
 * - 0x0                     offset of the pinyin list (32-bit integer)
 * - 0x120                   number of dictionary entries (32-bit integer)
 * - (pinyin list offset)    total number of pinyin (32-bit integer)
 * - ...                     list of pinyin as [index, byte length of pinyin, pinyin as string] triples
 * - ...                     dictionary, directly after the pinyin list
 *
 * Dictionary format:
 * - It can be interpreted as a list of
 *   [number of alternative words,
 *       byte length of pinyin indexes, pinyin indexes,
 *       [byte length of word, word as string, length of skip bytes, skip bytes]
 *       ... (one such group per alternative)
 *   ].
 * - Every index and length inside the lists is a 16-bit value.
 *
 * Usage: {@code java SogouScelReader [file.scel]}. Without an argument the
 * historical default path {@code D:\test.scel} is read.
 *
 * @author keke
 */
public class SogouScelReader {
    /** File that is read when no path is given on the command line. */
    private static final String DEFAULT_SCEL_FILE = "D:\\test.scel";

    /** Encoding of every string stored in the file. */
    private static final String TEXT_ENCODING = "UTF-16LE";

    /** Address of the 32-bit dictionary entry counter. */
    private static final int TOTAL_WORDS_ADDR = 0x120;

    /** Pinyin indexes are 16-bit values, so a table of this size can never be overrun. */
    private static final int PINYIN_INDEX_LIMIT = 0x10000;

    /**
     * Prints every dictionary entry as {@code word[, word...]<TAB>pin'yin}, followed by a summary line.
     *
     * @param args optional; {@code args[0]} is the path of the SCEL file to read
     *             (download from http://pinyin.sogou.com/dict)
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String scelFile = (args != null && args.length > 0) ? args[0] : DEFAULT_SCEL_FILE;

        // scel as bytes
        ByteBuffer data = ByteBuffer.wrap(readAllBytes(scelFile));
        data.order(ByteOrder.LITTLE_ENDIAN);

        int totalWords = data.getInt(TOTAL_WORDS_ADDR);

        // the first integer of the file is the offset of the pinyin list
        data.position(data.getInt(0));
        String[] pyDict = readPinyinTable(data);

        // the dictionary follows the pinyin list without a gap
        int counter = 0;
        for (int i = 0; i < totalWords; i++) {
            System.out.println(readEntry(data, pyDict));
            counter++;
        }
        System.out.println("\nExtracted '" + scelFile + "': " + counter);
    }

    /** Reads the whole file into memory and always releases the file handle. */
    private static byte[] readAllBytes(String path) throws IOException {
        RandomAccessFile file = new RandomAccessFile(path, "r");
        try {
            byte[] content = new byte[(int) file.length()];
            file.readFully(content);
            return content;
        } finally {
            file.close();
        }
    }

    /** Reads the pinyin list at the current position into a table addressed by pinyin index. */
    private static String[] readPinyinTable(ByteBuffer data) throws IOException {
        String[] pyDict = new String[PINYIN_INDEX_LIMIT];
        int totalPinyin = data.getInt();
        for (int i = 0; i < totalPinyin; i++) {
            int idx = readUnsignedShort(data);
            int len = readUnsignedShort(data);
            pyDict[idx] = readString(data, len);
        }
        return pyDict;
    }

    /** Reads one dictionary entry and formats it as {@code word[, word...]<TAB>pin'yin}. */
    private static String readEntry(ByteBuffer data, String[] pyDict) throws IOException {
        int alternatives = readUnsignedShort(data);
        // the stored value is a byte length; every pinyin index occupies two bytes
        int pinyinCount = readUnsignedShort(data) / 2;

        StringBuilder py = new StringBuilder();
        for (int i = 0; i < pinyinCount; i++) {
            if (i > 0) {
                py.append('\'');
            }
            // an index missing from the pinyin list is rendered as "null"
            py.append(pyDict[readUnsignedShort(data)]);
        }

        StringBuilder word = new StringBuilder();
        for (int i = 0; i < alternatives; i++) {
            if (i > 0) {
                word.append(", ");
            }
            word.append(readString(data, readUnsignedShort(data)));
            // skip bytes
            int skip = readUnsignedShort(data);
            data.position(data.position() + skip);
        }
        return word.toString() + "\t" + py.toString();
    }

    /** Reads a 16-bit little endian value without sign extension. */
    private static int readUnsignedShort(ByteBuffer data) {
        return data.getShort() & 0xffff;
    }

    /** Reads {@code byteLength} bytes at the current position and decodes them as UTF-16LE. */
    private static String readString(ByteBuffer data, int byteLength) throws IOException {
        // sized per string, so long words cannot overflow a shared fixed-size buffer
        byte[] raw = new byte[byteLength];
        data.get(raw);
        return new String(raw, TEXT_ENCODING);
    }
}
Advertisements

百度拼音输入法BCD词库文件解析(附java词库导出程序)

源程序下载:https://dict4cn.googlecode.com/svn/trunk/importer/src/BaiduBcdReader.java

Source Code:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;

/**
 * Baidu Pinyin IME BDICT File Reader
 *
 * BDICT Format overview:
 *
 * General Information:
 * - Chinese characters are all encoded with UTF-16LE.
 * - Numbers are using little endian byte order.
 *
 * BDICT hex analysis:
 * - 0x250         total number of words (32-bit integer)
 * - 0x350         start of the dictionary
 *
 * Dictionary format:
 * - It can be interpreted as a list of
 *   [amount of characters (short not integer!),
 *       one short that this reader skips,
 *       pinyin construction using fenmu and yunmu: one (fenmu code, yunmu code) byte pair per character,
 *       word as string (two bytes per character)
 *   ].
 *
 * Usage: {@code java BaiduBdictReader [file.bcd]}. Without an argument the
 * historical default path {@code D:\test.bcd} is read.
 *
 * @author keke
 */
public class BaiduBdictReader {
    /** Syllable initials, addressed by the first byte of a pinyin pair. */
    private static final String[] FEN_MU = { "c", "d", "b", "f", "g", "h", "ch", "j", "k", "l", "m", "n", "", "p", "q",
            "r", "s", "t", "sh", "zh", "w", "x", "y", "z" };

    /**
     * Syllable finals, addressed by the second byte of a pinyin pair.
     * Index 2 is "iong" and index 30 is "o"; the table previously repeated "ong" and "a" in those
     * slots, which made both finals impossible to decode.
     */
    private static final String[] YUN_MU = { "uang", "iang", "iong", "ang", "eng", "ian", "iao", "ing", "ong", "uai",
            "uan", "ai", "an", "ao", "ei", "en", "er", "ua", "ie", "in", "iu", "ou", "ia", "ue", "ui", "un", "uo", "a",
            "e", "i", "o", "u", "v" };

    /** File that is read when no path is given on the command line. */
    private static final String DEFAULT_BDICT_FILE = "D:\\test.bcd";

    /** Address of the 32-bit word counter. */
    private static final int TOTAL_WORDS_ADDR = 0x250;

    /** Address of the first dictionary entry. */
    private static final int DICTIONARY_ADDR = 0x350;

    /**
     * Prints every dictionary entry as {@code word<TAB>pin'yin}, followed by a summary line.
     *
     * @param args optional; {@code args[0]} is the path of the dictionary file to read
     *             (download from http://r6.mo.baidu.com/web/iw/index/)
     * @throws IOException if the file cannot be read or contains a pinyin code outside the tables
     */
    public static void main(String[] args) throws IOException {
        String bdictFile = (args != null && args.length > 0) ? args[0] : DEFAULT_BDICT_FILE;

        // bdict as bytes
        ByteBuffer data = ByteBuffer.wrap(readAllBytes(bdictFile));
        data.order(ByteOrder.LITTLE_ENDIAN);

        int total = data.getInt(TOTAL_WORDS_ADDR);
        data.position(DICTIONARY_ADDR);
        for (int i = 0; i < total; i++) {
            // number of characters, read as an unsigned 16-bit value
            int length = data.getShort() & 0xffff;
            // NOTE(review): the meaning of this second short is not known here; it is skipped
            data.getShort();

            StringBuilder pinyin = new StringBuilder();
            for (int j = 0; j < length; j++) {
                if (j > 0) {
                    pinyin.append('\'');
                }
                String fenMu = lookup(FEN_MU, data.get(), "fenmu");
                String yunMu = lookup(YUN_MU, data.get(), "yunmu");
                pinyin.append(fenMu).append(yunMu);
            }

            // sized per word, so long words cannot overflow a shared fixed-size buffer
            byte[] raw = new byte[2 * length];
            data.get(raw);
            String word = new String(raw, "UTF-16LE");
            System.out.println(word + "\t" + pinyin);
        }

        System.out.println("\nExtracted '" + bdictFile + "': " + total);
    }

    /** Reads the whole file into memory and always releases the file handle. */
    private static byte[] readAllBytes(String path) throws IOException {
        RandomAccessFile file = new RandomAccessFile(path, "r");
        try {
            byte[] content = new byte[(int) file.length()];
            file.readFully(content);
            return content;
        } finally {
            file.close();
        }
    }

    /** Maps a code byte (treated as unsigned) to its table entry, rejecting codes outside the table. */
    private static String lookup(String[] table, byte code, String kind) throws IOException {
        int index = code & 0xff;
        if (index >= table.length) {
            throw new IOException("Unsupported " + kind + " code 0x" + Integer.toHexString(index));
        }
        return table[index];
    }
}

QQ拼音qpyd词库文件解析(附java词库导出程序)

源程序下载:https://code.google.com/p/dict4cn/source/browse/trunk/importer/src/QQPinyinQpydReader.java

输出 (汽车品牌.qpyd):

名称:汽车品牌
类型:汽车
子类型:爱好
词库说明:汽车品牌
词库样例:三菱牌 印度斯坦 爱丽舍 吉奥 阿斯顿 京城海狮 风骏 东南汽车 福特牌 御马 富康 华阳汽车 海锋 奇瑞君威 德托 大发牌 都市骏马 利亚纳 法比亚伊比萨
词条数:961
压缩词库数据地址:0x180

三菱牌		san'ling'pai
典雅		dian'ya
奇兵		qi'bing
东风牌		dong'feng'pai
水星		shui'xing
新锋锐		xin'feng'rui
勇士		yong'shi
百利		bai'li
嘉年华		jia'nian'hua
飞碟汽车		fei'die'qi'che
...

Source Code:

import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.zip.InflaterOutputStream;

/**
 * QQ Pinyin IME QPYD File Reader
 * 
 * QPYD Format overview:
 * 
 * General Information:
 * - Chinese characters are all encoded with UTF-16LE.
 * - Pinyin are encoded in ascii (or UTF-8).
 * - Numbers are using little endian byte order.
 * 
 * QPYD hex analysis:
 * - 0x00 QPYD file identifier
 * - 0x38 offset of compressed data (word-pinyin-dictionary)
 * - 0x44 total words in qpyd
 * - 0x60 start of header information
 * 
 * Compressed data analysis:
 * - zip/standard (beginning with 0x789C) is used in (all analyzed) qpyd files
 * - data is divided in two parts
 * -- 1. offset and length information (16 bytes for each pinyin-word pair)
 *       0x06 offset points to first pinyin
 *       0x00 length of pinyin
 *       0x01 length of word
 * -- 2. actual data
 *       Dictionary data has the form ((pinyin)(word))* with no separators.
 *       Data can only be read using offset and length information.
 * 
 * 
 * @author keke
 */
public class QQPinyinQpydReader {
    public static void main(String[] args) throws IOException {
        // download from http://dict.py.qq.com/list.php
        String qqydFile = "D:\\test.qpyd";

        // read qpyd into byte array
        ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
        FileChannel fChannel = new RandomAccessFile(qqydFile, "r").getChannel();
        fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
        fChannel.close();

        // qpyd as bytes
        ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
        dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

        // read info of compressed data
        int startZippedDictAddr = dataRawBytes.getInt(0x38);
        int zippedDictLength = dataRawBytes.limit() - startZippedDictAddr;

        // qpys as UTF-16LE string
        String dataString = new String(Arrays.copyOfRange(dataRawBytes.array(), 0x60, startZippedDictAddr), "UTF-16LE");

        // print header
        System.out.println("名称:" + substringBetween(dataString, "Name: ", "\r\n"));
        System.out.println("类型:" + substringBetween(dataString, "Type: ", "\r\n"));
        System.out.println("子类型:" + substringBetween(dataString, "FirstType: ", "\r\n"));
        System.out.println("词库说明:" + substringBetween(dataString, "Intro: ", "\r\n"));
        System.out.println("词库样例:" + substringBetween(dataString, "Example: ", "\r\n"));
        System.out.println("词条数:" + dataRawBytes.getInt(0x44));

        // read zipped qqyd dictionary into byte array
        dataOut.reset();
        Channels.newChannel(new InflaterOutputStream(dataOut)).write(
                ByteBuffer.wrap(dataRawBytes.array(), startZippedDictAddr, zippedDictLength));

        // uncompressed qqyd dictionary as bytes
        ByteBuffer dataUnzippedBytes = ByteBuffer.wrap(dataOut.toByteArray());
        dataUnzippedBytes.order(ByteOrder.LITTLE_ENDIAN);

        // for debugging: save unzipped data to *.unzipped file
        Channels.newChannel(new FileOutputStream(qqydFile + ".unzipped")).write(dataUnzippedBytes);
        System.out.println("压缩数据:0x" + Integer.toHexString(startZippedDictAddr) + " (解压前:" + zippedDictLength
                + " B, 解压后:" + dataUnzippedBytes.limit() + " B)");
        
        // stores the start address of actual dictionary data
        int unzippedDictStartAddr = -1;
        int idx = 0;
        byte[] byteArray = dataUnzippedBytes.array();
        while (unzippedDictStartAddr == -1 || idx < unzippedDictStartAddr) {
            // read word
            int pinyinStartAddr = dataUnzippedBytes.getInt(idx + 0x6);
            int pinyinLength = dataUnzippedBytes.get(idx + 0x0) & 0xff;
            int wordStartAddr = pinyinStartAddr + pinyinLength;
            int wordLength = dataUnzippedBytes.get(idx + 0x1) & 0xff;
            if (unzippedDictStartAddr == -1) {
                unzippedDictStartAddr = pinyinStartAddr;
                System.out.println("词库地址(解压后):0x" + Integer.toHexString(unzippedDictStartAddr) + "\n");
            }

            String pinyin = new String(Arrays.copyOfRange(byteArray, pinyinStartAddr, pinyinStartAddr + pinyinLength),
                    "UTF-8");
            String word = new String(Arrays.copyOfRange(byteArray, wordStartAddr, wordStartAddr + wordLength),
                    "UTF-16LE");
            System.out.println(word + "\t" + pinyin);

            // step up
            idx += 0xa;
        }
    }

    public static final String substringBetween(String text, String start, String end) {
        int nStart = text.indexOf(start);
        int nEnd = text.indexOf(end, nStart + 1);
        if (nStart != -1 && nEnd != -1) {
            return text.substring(nStart + start.length(), nEnd);
        } else {
            return null;
        }
    }
}