QQ拼音qpyd词库文件解析(附java词库导出程序)

源程序下载:https://code.google.com/p/dict4cn/source/browse/trunk/importer/src/QQPinyinQpydReader.java

输出 (汽车品牌.qpyd):

名称:汽车品牌
类型:汽车
子类型:爱好
词库说明:汽车品牌
词库样例:三菱牌 印度斯坦 爱丽舍 吉奥 阿斯顿 京城海狮 风骏 东南汽车 福特牌 御马 富康 华阳汽车 海锋 奇瑞君威 德托 大发牌 都市骏马 利亚纳 法比亚伊比萨
词条数:961
压缩词库数据地址:0x180

三菱牌		san'ling'pai
典雅		dian'ya
奇兵		qi'bing
东风牌		dong'feng'pai
水星		shui'xing
新锋锐		xin'feng'rui
勇士		yong'shi
百利		bai'li
嘉年华		jia'nian'hua
飞碟汽车		fei'die'qi'che
...

Source Code:

import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.zip.InflaterOutputStream;

/**
 * QQ Pinyin IME QPYD File Reader
 * 
 * Prints the header fields and every word/pinyin pair of a *.qpyd dictionary
 * to standard output, and writes the inflated dictionary data next to the
 * input file as "&lt;input&gt;.unzipped" (for debugging).
 * 
 * Usage: java QQPinyinQpydReader [path-to-qpyd] (defaults to D:\test.qpyd)
 * 
 * QPYD Format overview:
 * 
 * General Information:
 * - Chinese characters are all encoded with UTF-16LE.
 * - Pinyin are encoded in ascii (or UTF-8).
 * - Numbers are using little endian byte order.
 * 
 * QPYD hex analysis:
 * - 0x00 QPYD file identifier
 * - 0x38 offset of compressed data (word-pinyin-dictionary)
 * - 0x44 total words in qpyd
 * - 0x60 start of header information
 * 
 * Compressed data analysis:
 * - zlib/deflate (beginning with 0x789C) is used in (all analyzed) qpyd files
 * - data is divided in two parts
 * -- 1. offset and length information (10 bytes for each pinyin-word pair)
 *       0x00 length of pinyin (1 byte)
 *       0x01 length of word (1 byte)
 *       0x06 offset of the pinyin (4 bytes); the offset stored in the first
 *            record also marks the end of this index table
 *       (bytes 0x02-0x05 are not used by this reader)
 * -- 2. actual data
 *       Dictionary data has the form ((pinyin)(word))* with no separators.
 *       Data can only be read using offset and length information.
 * 
 * 
 * @author keke
 */
public class QQPinyinQpydReader {
    /** File that is read when no path is passed on the command line. */
    private static final String DEFAULT_QPYD_FILE = "D:\\test.qpyd";

    /** Position of the int holding the offset of the compressed dictionary. */
    private static final int ZIPPED_DICT_OFFSET_ADDR = 0x38;

    /** Position of the int holding the number of words. */
    private static final int WORD_COUNT_ADDR = 0x44;

    /** Position where the UTF-16LE header text starts. */
    private static final int HEADER_TEXT_ADDR = 0x60;

    /** Size in bytes of one record of the index table in the inflated data. */
    private static final int INDEX_RECORD_SIZE = 0xa;

    /**
     * Reads a qpyd file and dumps its header and all entries to standard output.
     * 
     * @param args optional; {@code args[0]} is the path of the qpyd file. When
     *            absent, {@code D:\test.qpyd} is used.
     * @throws IOException if the file cannot be read, is not laid out as
     *             expected, or the debug output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // download from http://dict.py.qq.com/list.php
        String qqydFile = (args != null && args.length > 0) ? args[0] : DEFAULT_QPYD_FILE;

        // qpyd as bytes
        byte[] rawBytes = readFile(qqydFile);
        if (rawBytes.length < HEADER_TEXT_ADDR) {
            throw new IOException("Not a qpyd file (only " + rawBytes.length + " bytes): " + qqydFile);
        }
        ByteBuffer dataRawBytes = ByteBuffer.wrap(rawBytes).order(ByteOrder.LITTLE_ENDIAN);

        // read info of compressed data; everything after the offset is compressed
        int startZippedDictAddr = dataRawBytes.getInt(ZIPPED_DICT_OFFSET_ADDR);
        if (startZippedDictAddr < HEADER_TEXT_ADDR || startZippedDictAddr > rawBytes.length) {
            throw new IOException("Invalid compressed data offset 0x" + Integer.toHexString(startZippedDictAddr)
                    + " in " + qqydFile);
        }
        int zippedDictLength = rawBytes.length - startZippedDictAddr;

        // header text (between 0x60 and the compressed data) as UTF-16LE string
        String dataString = new String(rawBytes, HEADER_TEXT_ADDR, startZippedDictAddr - HEADER_TEXT_ADDR,
                "UTF-16LE");

        // print header; a missing field is printed as "null"
        System.out.println("名称:" + substringBetween(dataString, "Name: ", "\r\n"));
        System.out.println("类型:" + substringBetween(dataString, "Type: ", "\r\n"));
        System.out.println("子类型:" + substringBetween(dataString, "FirstType: ", "\r\n"));
        System.out.println("词库说明:" + substringBetween(dataString, "Intro: ", "\r\n"));
        System.out.println("词库样例:" + substringBetween(dataString, "Example: ", "\r\n"));
        System.out.println("词条数:" + dataRawBytes.getInt(WORD_COUNT_ADDR));

        // uncompressed qpyd dictionary as bytes
        byte[] unzippedBytes = inflate(rawBytes, startZippedDictAddr, zippedDictLength);

        // for debugging: save unzipped data to *.unzipped file
        writeFile(qqydFile + ".unzipped", unzippedBytes);
        System.out.println("压缩数据:0x" + Integer.toHexString(startZippedDictAddr) + " (解压前:" + zippedDictLength
                + " B, 解压后:" + unzippedBytes.length + " B)");

        printEntries(unzippedBytes);
    }

    /**
     * Returns the text between the first occurrence of {@code start} and the
     * next occurrence of {@code end} that follows it.
     * 
     * @param text text to search in
     * @param start marker preceding the wanted text
     * @param end marker following the wanted text; only looked for after the
     *            end of {@code start}
     * @return the enclosed text (possibly empty), or {@code null} if any
     *         argument is {@code null} or either marker is not found
     */
    public static final String substringBetween(String text, String start, String end) {
        if (text == null || start == null || end == null) {
            return null;
        }
        int nStart = text.indexOf(start);
        if (nStart == -1) {
            return null;
        }
        // Search for the end marker only behind the complete start marker;
        // otherwise an end marker overlapping the start marker yields an
        // invalid (negative length) range.
        int contentStart = nStart + start.length();
        int nEnd = text.indexOf(end, contentStart);
        if (nEnd == -1) {
            return null;
        }
        return text.substring(contentStart, nEnd);
    }

    /** Reads the complete file at {@code path} into a byte array. */
    private static byte[] readFile(String path) throws IOException {
        RandomAccessFile file = new RandomAccessFile(path, "r");
        try {
            long size = file.length();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large (" + size + " bytes): " + path);
            }
            byte[] bytes = new byte[(int) size];
            file.readFully(bytes);
            return bytes;
        } finally {
            file.close();
        }
    }

    /** Inflates {@code length} bytes of zlib data starting at {@code offset}. */
    private static byte[] inflate(byte[] data, int offset, int length) throws IOException {
        ByteArrayOutputStream inflated = new ByteArrayOutputStream();
        InflaterOutputStream inflater = new InflaterOutputStream(inflated);
        try {
            inflater.write(data, offset, length);
        } finally {
            // closing flushes remaining output and releases the inflater
            inflater.close();
        }
        return inflated.toByteArray();
    }

    /** Writes {@code bytes} to the file at {@code path}, replacing any existing file. */
    private static void writeFile(String path, byte[] bytes) throws IOException {
        FileOutputStream out = new FileOutputStream(path);
        try {
            out.write(bytes);
        } finally {
            out.close();
        }
    }

    /**
     * Walks the index table at the start of the inflated data and prints each
     * entry as {@code word<TAB>pinyin}.
     */
    private static void printEntries(byte[] unzippedBytes) throws IOException {
        if (unzippedBytes.length == 0) {
            // nothing was stored in the compressed section
            return;
        }
        ByteBuffer dataUnzippedBytes = ByteBuffer.wrap(unzippedBytes).order(ByteOrder.LITTLE_ENDIAN);

        // start address of actual dictionary data == end of the index table;
        // it is only known once the first record has been read
        int unzippedDictStartAddr = -1;
        for (int idx = 0; unzippedDictStartAddr == -1 || idx < unzippedDictStartAddr; idx += INDEX_RECORD_SIZE) {
            if (idx > unzippedBytes.length - INDEX_RECORD_SIZE) {
                throw new IOException("Truncated index table at 0x" + Integer.toHexString(idx));
            }

            // read index record
            int pinyinLength = dataUnzippedBytes.get(idx + 0x0) & 0xff;
            int wordLength = dataUnzippedBytes.get(idx + 0x1) & 0xff;
            int pinyinStartAddr = dataUnzippedBytes.getInt(idx + 0x6);
            // compared this way round so that a huge offset cannot overflow
            if (pinyinStartAddr < 0 || pinyinStartAddr > unzippedBytes.length - pinyinLength - wordLength) {
                throw new IOException("Index record at 0x" + Integer.toHexString(idx)
                        + " points outside of the dictionary data");
            }
            int wordStartAddr = pinyinStartAddr + pinyinLength;

            if (unzippedDictStartAddr == -1) {
                unzippedDictStartAddr = pinyinStartAddr;
                System.out.println("词库地址(解压后):0x" + Integer.toHexString(unzippedDictStartAddr) + "\n");
            }

            String pinyin = new String(unzippedBytes, pinyinStartAddr, pinyinLength, "UTF-8");
            String word = new String(unzippedBytes, wordStartAddr, wordLength, "UTF-16LE");
            System.out.println(word + "\t" + pinyin);
        }
    }
}
Advertisements

发表评论

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / 更改 )

Twitter picture

You are commenting using your Twitter account. Log Out / 更改 )

Facebook photo

You are commenting using your Facebook account. Log Out / 更改 )

Google+ photo

You are commenting using your Google+ account. Log Out / 更改 )

Connecting to %s