搜狗sgim_core.bin文件解读(java)

源程序:https://dict4cn.googlecode.com/svn/trunk/importer/src/SogouSgimCoreBinReader.java

 

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;

/**
 * Sogou sgim_core.bin Reader
 * 
 * 
 * 地址:
 * 0x0C:单词数量
 * ????:单词长度(byte),单词(编码:UTF-16LE)
 * 
 * For other files such as sgim_eng.bin, the implementation has to be modified slightly.
 * 
 * @author keke
 */
public class SogouSgimCoreBinReader {
    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_BIN_FILE = "D:\\sgim_core.bin";

    /** File offset of the 32-bit little-endian word count. */
    private static final int WORD_COUNT_OFFSET = 0xC;

    /**
     * Default marker that locates the word list (sgim_core.bin, 6.1.0.6700). The word list is parsed starting
     * AT the marker, so the marker doubles as the first entry: its low 16 bits (0x0002) are read as the entry
     * length and the following two bytes as the entry data. For sgim_eng.bin (6.1.0.6700) the original author
     * used 0x00610002 instead; pass it as the second command line argument.
     */
    private static final int CORE_MAGIC = 0x554a0002;

    /**
     * Counts the length-prefixed word entries of a Sogou bin file and prints where the word list starts and ends.
     *
     * @param args
     *            optional; {@code args[0]} is the file to read (default: {@code D:\sgim_core.bin}),
     *            {@code args[1]} is the start marker as hexadecimal digits without prefix (default:
     *            {@code 554a0002})
     * @throws IOException
     *             if the file cannot be read
     * @throws NumberFormatException
     *             if {@code args[1]} is not a hexadecimal number
     */
    public static void main(String[] args) throws IOException {
        final String binFile = args != null && args.length > 0 ? args[0] : DEFAULT_BIN_FILE;
        // parsed as long first so that markers with the highest bit set are accepted as well
        final int magic = args != null && args.length > 1 ? (int) Long.parseLong(args[1], 16) : CORE_MAGIC;

        final ByteBuffer bb = readFile(binFile);
        if (bb.limit() < WORD_COUNT_OFFSET + 4) {
            System.err.println("文件太小,无法读取单词数量: " + binFile);
            return;
        }

        final int words = bb.getInt(WORD_COUNT_OFFSET);
        System.out.println("读入文件: " + binFile + ",单词:" + words);

        final int startPos = findStartPosition(bb, magic);
        if (startPos < 0) {
            System.err.println("未找到单词起始标记: 0x" + Integer.toHexString(magic));
            return;
        }

        System.out.println("单词起始位置:0x" + Integer.toHexString(startPos));
        bb.position(startPos);
        final int counter = countEntries(bb, words);
        final int endPos = bb.position();
        final int diff = endPos - startPos;
        System.out.println("读出单词'" + binFile + "':" + counter);
        System.out.println("单词结尾位置:0x" + Integer.toHexString(endPos));
        System.out.println("单词词典长度:0x" + Integer.toHexString(diff));
    }

    /**
     * Reads the complete file into a little-endian buffer; the file is closed even if reading fails.
     */
    private static ByteBuffer readFile(final String file) throws IOException {
        final RandomAccessFile raf = new RandomAccessFile(file, "r");
        try {
            final long size = raf.length();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large to be read into memory: " + file);
            }
            final byte[] bytes = new byte[(int) size];
            // readFully loops until the array is filled; a single read() call gives no such guarantee
            raf.readFully(bytes);
            final ByteBuffer buffer = ByteBuffer.wrap(bytes);
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            return buffer;
        } finally {
            raf.close();
        }
    }

    /**
     * Scans the buffer from its beginning in 4-byte steps and returns the position of the first int equal to
     * {@code magic}, or -1 if there is none. Trailing bytes that do not form a complete int are ignored.
     */
    private static int findStartPosition(final ByteBuffer bb, final int magic) {
        bb.position(0);
        while (bb.remaining() >= 4) {
            if (bb.getInt() == magic) {
                return bb.position() - 4;
            }
        }
        return -1;
    }

    /**
     * Skips over at most {@code maxEntries} entries (16-bit length followed by that many bytes) starting at the
     * buffer's current position and returns how many complete entries were found. Stops early, leaving the
     * position at the start of the offending entry, when the remaining data is truncated or a length is invalid.
     */
    private static int countEntries(final ByteBuffer bb, final int maxEntries) {
        int counter = 0;
        while (counter < maxEntries && bb.remaining() >= 2) {
            final int length = bb.getShort();
            if (length < 0 || length > bb.remaining()) {
                bb.position(bb.position() - 2);
                System.err.println("单词数据不完整: 0x" + Integer.toHexString(bb.position()));
                break;
            }
            // the entry data is UTF-16LE text; it is only skipped here, not decoded
            bb.position(bb.position() + length);
            counter++;
        }
        return counter;
    }
}

Lingoes灵格斯电子词典LD2(LDF)文件解析(附java词典导出程序)

新 Lingoes灵格斯电子词典LD2(LDF)文件单词提取器

http://code.google.com/p/lingoes-extractor/

下载

1. Windows版: http://lingoes-extractor.googlecode.com/files/lingoes-extractor-1.0.exe

2. Java版:http://lingoes-extractor.googlecode.com/files/lingoes-extractor-1.0.jar

程序演示

选择LD2文件跟导出文件:

导出后的文件:

 

 

 

支持已知所有Lingoes词典版本(2.x)。自动导出索引组(*.idx),所有词组(*.words),翻译(*.output)文件等。

Lingoes Reader / Exporter 源程序下载:https://dict4cn.googlecode.com/svn/trunk/importer/src/LingoesLd2Reader.java

源文件:

import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

/**
 * Lingoes LD2/LDF File Reader
 * 
 * <pre>
 * Lingoes Format overview:
 * 
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers are using little endian byte order.
 * - Definitions and xml data have UTF-8 or UTF-16LE encodings.
 * 
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary) 
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 * 
 * TODO: find encoding / language fields to replace auto-detect of encodings
 * 
 * </pre>
 * 
 * @author keke
 * 
 */
public class LingoesLd2Reader {
    /** Candidate encodings, tried in this order by {@link #detectEncodings}. */
    private static final String[] AVAIL_ENCODINGS = { "UTF-8", "UTF-16LE", "UTF-16BE" };

    /** Matches decoded text containing a control character, taken as a sign of a wrongly guessed encoding. */
    private static final Pattern CONTROL_CHARS = Pattern.compile("^.*[\\x00-\\x1f].*$");

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_LD2_FILE = "X:\\kkdict\\dicts\\lingoes\\Prodic English-Vietnamese Business.ld2";

    /** Encoding of all exported text files; fixed so the export does not depend on the platform default. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /** Number of leading bytes {@link #main} reads header fields from. */
    private static final int HEADER_LENGTH = 0x60;

    private static final String CDATA_OPEN = "<![CDATA[";

    /**
     * Reads an LD2/LDF file, prints its header information and exports its contents next to the input file
     * ({@code .inflated}, {@code .idx}, {@code .words}, {@code .xml}, {@code .output}).
     *
     * @param args
     *            optional; {@code args[0]} is the dictionary file to read, otherwise a built-in default path is
     *            used
     * @throws IOException
     *             if the input cannot be read or an export file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // download from
        // https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
        final String ld2File = args != null && args.length > 0 ? args[0] : DEFAULT_LD2_FILE;

        final ByteBuffer dataRawBytes = readFile(ld2File);

        System.out.println("文件:" + ld2File);
        if (dataRawBytes.limit() < HEADER_LENGTH) {
            // too short to even hold the header fields read below
            System.err.println("文件不包含字典数据。网上字典?");
            return;
        }
        System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
        System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
        System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

        final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
        // two ints (type and length) are read at offsetData, so 8 bytes must be available there
        if (offsetData >= 0 && dataRawBytes.limit() - 8 >= offsetData) {
            System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
            final int type = dataRawBytes.getInt(offsetData);
            System.out.println("简介类型:0x" + Integer.toHexString(type));
            final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
            if (type == 3) {
                // without additional information
                readDictionary(ld2File, dataRawBytes, offsetData);
            } else if (dataRawBytes.limit() > offsetWithInfo - 0x1C) {
                readDictionary(ld2File, dataRawBytes, offsetWithInfo);
            } else {
                System.err.println("文件不包含字典数据。网上字典?");
            }
        } else {
            System.err.println("文件不包含字典数据。网上字典?");
        }
    }

    /**
     * Reads the complete file into an array-backed, little-endian buffer; the file is closed even if reading
     * fails.
     */
    private static ByteBuffer readFile(final String file) throws IOException {
        final RandomAccessFile raf = new RandomAccessFile(file, "r");
        try {
            final long size = raf.length();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large to be read into memory: " + file);
            }
            final byte[] bytes = new byte[(int) size];
            raf.readFully(bytes);
            final ByteBuffer buffer = ByteBuffer.wrap(bytes);
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            return buffer;
        } finally {
            raf.close();
        }
    }

    /** Opens a buffered writer that encodes in {@link #OUTPUT_ENCODING} instead of the platform default. */
    private static Writer openWriter(final String file) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), OUTPUT_ENCODING));
    }

    /** Closes every non-null writer; all are attempted, the first failure (if any) is rethrown afterwards. */
    private static void closeAll(final Writer... writers) throws IOException {
        IOException first = null;
        for (Writer writer : writers) {
            if (writer == null) {
                continue;
            }
            try {
                writer.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                }
            }
        }
        if (first != null) {
            throw first;
        }
    }

    /**
     * Inflates one deflate stream taken from {@code data} into {@code inflatedFile}.
     *
     * @param inflatedFile
     *            target file
     * @param data
     *            array-backed buffer holding the compressed bytes
     * @param offset
     *            start of the stream within the backing array
     * @param length
     *            number of compressed bytes made available to the inflater
     * @param append
     *            {@code true} to append to the target file, {@code false} to overwrite it
     * @return number of compressed bytes consumed by the inflater
     * @throws IOException
     *             if inflating or writing fails
     */
    private static final long decompress(final String inflatedFile, final ByteBuffer data, final int offset,
            final int length, final boolean append) throws IOException {
        final Inflater inflator = new Inflater();
        try {
            final InputStream in = new InflaterInputStream(new ByteArrayInputStream(data.array(), offset, length),
                    inflator, 1024 * 8);
            try {
                final OutputStream out = new FileOutputStream(inflatedFile, append);
                try {
                    writeInputStream(in, out);
                    return inflator.getBytesRead();
                } finally {
                    out.close();
                }
            } finally {
                in.close();
            }
        } finally {
            // a caller-supplied Inflater is not released by InflaterInputStream.close()
            inflator.end();
        }
    }

    /**
     * Guesses the encodings of the definition words and of the xml by decoding up to the first ten entries: as
     * long as a decoded text contains a control character, the next candidate of {@link #AVAIL_ENCODINGS} is
     * selected and the check starts over from the first entry. Once the last candidate is reached it is kept,
     * even if control characters remain.
     *
     * @param idxData
     *            scratch array (6 ints), overwritten
     * @param defData
     *            scratch array (2 strings), overwritten
     * @return two-element array: encoding of the words, encoding of the xml
     */
    private static final String[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int defTotal, final int dataLen, final int[] idxData, final String[] defData)
            throws UnsupportedEncodingException {
        final int tests = Math.min(defTotal, 10);
        final int lastEncoding = AVAIL_ENCODINGS.length - 1;
        int defEnc = 0;
        int xmlEnc = 0;
        for (int i = 0; i < tests; i++) {
            readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen, AVAIL_ENCODINGS[defEnc],
                    AVAIL_ENCODINGS[xmlEnc], idxData, defData, i);
            boolean changed = false;
            if (defEnc < lastEncoding && CONTROL_CHARS.matcher(defData[0]).matches()) {
                defEnc++;
                changed = true;
            }
            if (xmlEnc < lastEncoding && CONTROL_CHARS.matcher(defData[1]).matches()) {
                xmlEnc++;
                changed = true;
            }
            if (changed) {
                // Restart with entry 0 (the loop increment turns -1 into 0). Restarting only when a candidate
                // was actually advanced bounds the number of restarts and guarantees termination.
                i = -1;
            }
        }
        System.out.println("词组编码:" + AVAIL_ENCODINGS[defEnc]);
        System.out.println("XML编码:" + AVAIL_ENCODINGS[xmlEnc]);
        return new String[] { AVAIL_ENCODINGS[defEnc], AVAIL_ENCODINGS[xmlEnc] };
    }

    /**
     * Decodes all definitions of the inflated file and writes them as UTF-8 text files: one word per line, one
     * stripped translation per line, {@code word=translation} lines, and the index group as {@code word, index}
     * lines. Every pair is also printed to standard output.
     *
     * @param idxArray
     *            index group, used as indices into the decoded definitions
     * @param offsetDefs
     *            start of the words inside the inflated data (equals the size of the preceding index data)
     * @param offsetXml
     *            start of the xml inside the inflated data
     */
    private static final void extract(final String inflatedFile, final String indexFile,
            final String extractedWordsFile, final String extractedXmlFile, final String extractedOutputFile,
            final int[] idxArray, final int offsetDefs, final int offsetXml) throws IOException, FileNotFoundException,
            UnsupportedEncodingException {
        System.out.println("写入'" + extractedOutputFile + "'。。。");

        // read inflated data
        final ByteBuffer dataRawBytes = readFile(inflatedFile);

        // Each index entry is 10 bytes long; decoding entry i also reads the first 8 bytes of entry i + 1,
        // hence the last entry is not counted as a definition.
        final int dataLen = 10;
        final int defTotal = Math.max(0, offsetDefs / dataLen - 1);

        final String[] words = new String[defTotal];
        final int[] idxData = new int[6];
        final String[] defData = new String[2];

        final String[] encodings = detectEncodings(dataRawBytes, offsetDefs, offsetXml, defTotal, dataLen, idxData,
                defData);
        final String defEncoding = encodings[0];
        final String xmlEncoding = encodings[1];

        int counter = 0;
        Writer indexWriter = null;
        Writer defsWriter = null;
        Writer xmlWriter = null;
        Writer outputWriter = null;
        try {
            indexWriter = openWriter(indexFile);
            defsWriter = openWriter(extractedWordsFile);
            xmlWriter = openWriter(extractedXmlFile);
            outputWriter = openWriter(extractedOutputFile);

            for (int i = 0; i < defTotal; i++) {
                readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, defEncoding, xmlEncoding, idxData,
                        defData, i);

                words[i] = defData[0];
                defsWriter.write(defData[0]);
                defsWriter.write("\n");

                xmlWriter.write(defData[1]);
                xmlWriter.write("\n");

                outputWriter.write(defData[0]);
                outputWriter.write("=");
                outputWriter.write(defData[1]);
                outputWriter.write("\n");

                System.out.println(defData[0] + " = " + defData[1]);
                counter++;
            }

            for (int i = 0; i < idxArray.length; i++) {
                final int idx = idxArray[i];
                if (idx < 0 || idx >= words.length) {
                    // skip entries that do not point at a decoded definition instead of aborting the export
                    System.err.println("忽略无效索引: " + idx);
                    continue;
                }
                indexWriter.write(words[idx]);
                indexWriter.write(", ");
                indexWriter.write(String.valueOf(idx));
                indexWriter.write("\n");
            }
        } finally {
            closeAll(indexWriter, defsWriter, xmlWriter, outputWriter);
        }
        System.out.println("成功读出" + counter + "组数据。");
    }

    /**
     * Reads 18 bytes of index data starting at {@code position}: two ints, two unsigned bytes, two ints. As used
     * by {@link #readDefinitionData}: [0]/[1] are the start offsets of the entry's words data and xml, [3] is its
     * number of references, and [4]/[5] (already part of the following 10-byte entry) are the end offsets. [2] is
     * read but not evaluated.
     */
    private static final void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
        dataRawBytes.position(position);
        wordIdxData[0] = dataRawBytes.getInt();
        wordIdxData[1] = dataRawBytes.getInt();
        wordIdxData[2] = dataRawBytes.get() & 0xff;
        wordIdxData[3] = dataRawBytes.get() & 0xff;
        wordIdxData[4] = dataRawBytes.getInt();
        wordIdxData[5] = dataRawBytes.getInt();
    }

    /**
     * Inflates all deflate streams into {@code inflatedFile}, one after the other. The streams start at the
     * buffer's current position; {@code deflateStreams} holds the end offset of each stream relative to that
     * position. This is best effort: a failure is reported on standard error and ends the inflation, leaving
     * whatever was written so far.
     */
    private static final void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams,
            final String inflatedFile) {
        System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
        final int startOffset = dataRawBytes.position();
        int offset = -1;
        int lastOffset = startOffset;
        boolean append = false;
        try {
            for (Integer offsetRelative : deflateStreams) {
                offset = startOffset + offsetRelative.intValue();
                decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
                // the first stream overwrites the target file, all following ones are appended
                append = true;
                lastOffset = offset;
            }
        } catch (Exception e) {
            System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
        }
    }

    /**
     * Decodes definition {@code i}: stores the word in {@code defData[0]} and the stripped translation in
     * {@code defData[1]}. If the entry has references, its words data starts with one int per reference (the
     * index of another entry); the stripped translations of those entries are put in front of the entry's own
     * translation, separated by ", ".
     *
     * @param idxData
     *            scratch array (6 ints), overwritten
     */
    private static final void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int dataLen, final String wordEncoding, final String xmlEncoding,
            final int[] idxData, final String[] defData, final int i) throws UnsupportedEncodingException {
        getIdxData(inflatedBytes, dataLen * i, idxData);
        int wordStart = idxData[0];
        int refs = idxData[3];
        final int wordEnd = idxData[4];
        String xml = decodeXml(inflatedBytes, offsetXml, idxData[1], idxData[5], xmlEncoding);
        while (refs-- > 0) {
            final int ref = inflatedBytes.getInt(offsetWords + wordStart);
            getIdxData(inflatedBytes, dataLen * ref, idxData);
            final String refXml = decodeXml(inflatedBytes, offsetXml, idxData[1], idxData[5], xmlEncoding);
            xml = xml.isEmpty() ? refXml : refXml + ", " + xml;
            // the word itself starts behind the reference ints
            wordStart += 4;
        }
        defData[1] = xml;
        defData[0] = new String(inflatedBytes.array(), offsetWords + wordStart, wordEnd - wordStart, wordEncoding);
    }

    /** Decodes the xml bytes between the two relative offsets and reduces them to text via {@link #strip}. */
    private static String decodeXml(final ByteBuffer inflatedBytes, final int offsetXml, final int start,
            final int end, final String xmlEncoding) throws UnsupportedEncodingException {
        return strip(new String(inflatedBytes.array(), offsetXml + start, end - start, xmlEncoding));
    }

    /**
     * Reads the dictionary section starting at {@code offsetWithIndex}: prints its layout, inflates the
     * compressed streams into {@code ld2File + ".inflated"} and, if that file exists afterwards, exports its
     * contents via {@link #extract}.
     */
    private static final void readDictionary(final String ld2File, final ByteBuffer dataRawBytes,
            final int offsetWithIndex) throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
        final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
        final int offsetIndex = offsetWithIndex + 0x1C;
        final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
        final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
        final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
        final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
        // the index group is an array of ints between the section header and the compressed data header
        final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;

        // Collect the relative stream offsets. The loop ends as soon as the last offset read, added to the
        // current position, reaches the section limit; the compressed data begins right behind the offsets.
        final List<Integer> deflateStreams = new ArrayList<Integer>();
        dataRawBytes.position(offsetCompressedDataHeader + 8);
        int offset = dataRawBytes.getInt();
        while (offset + dataRawBytes.position() < limit) {
            offset = dataRawBytes.getInt();
            deflateStreams.add(Integer.valueOf(offset));
        }
        final int offsetCompressedData = dataRawBytes.position();

        System.out.println("索引词组数目:" + definitions);
        System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / "
                + (offsetCompressedDataHeader - offsetIndex) + " B");
        System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / "
                + (limit - offsetCompressedData) + " B");
        System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
        System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength) + " / "
                + inflatedWordsLength + " B");
        System.out.println("XML地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength)
                + " / " + inflatedXmlLength + " B");
        System.out.println("文件大小(解压缩后):" + (inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024
                + " KB");

        final String inflatedFile = ld2File + ".inflated";
        // inflate() relies on the buffer still being positioned at the start of the compressed data
        inflate(dataRawBytes, deflateStreams, inflatedFile);

        if (new File(inflatedFile).isFile()) {
            final String indexFile = ld2File + ".idx";
            final String extractedFile = ld2File + ".words";
            final String extractedXmlFile = ld2File + ".xml";
            final String extractedOutputFile = ld2File + ".output";

            dataRawBytes.position(offsetIndex);
            final int[] idxArray = new int[definitions];
            for (int i = 0; i < definitions; i++) {
                idxArray[i] = dataRawBytes.getInt();
            }
            extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile, idxArray,
                    inflatedWordsIndexLength, inflatedWordsIndexLength + inflatedWordsLength);
        }
    }

    /**
     * Reduces a translation to plain text. In order of precedence: the content of the first CDATA section; the
     * content of the first {@code <Ô...>} element; otherwise the input with all {@code <...>} tags removed. Tabs,
     * line feeds and the characters U+001E / U+001F are replaced by spaces. An unterminated CDATA section or a
     * malformed {@code <Ô...>} element yields an empty string.
     */
    private static final String strip(final String xml) {
        final int cdataOpen = xml.indexOf(CDATA_OPEN);
        if (cdataOpen != -1) {
            final int cdataEnd = xml.indexOf("]]>", cdataOpen);
            if (cdataEnd == -1) {
                return "";
            }
            return normalize(xml.substring(cdataOpen + CDATA_OPEN.length(), cdataEnd));
        }

        final int tagOpen = xml.indexOf("<Ô");
        if (tagOpen != -1) {
            final int tagEnd = xml.indexOf("</Ô", tagOpen);
            final int contentStart = xml.indexOf(">", tagOpen + 1);
            // the opening tag has to be closed before the closing tag begins
            if (tagEnd == -1 || contentStart == -1 || contentStart > tagEnd) {
                return "";
            }
            return normalize(xml.substring(contentStart + 1, tagEnd));
        }

        // No known container: drop everything between '<' and '>' and keep all other text, including text in
        // front of the first and behind the last tag.
        final StringBuilder text = new StringBuilder(xml.length());
        boolean insideTag = false;
        for (int i = 0; i < xml.length(); i++) {
            final char c = xml.charAt(i);
            if (c == '<') {
                insideTag = true;
            } else if (insideTag) {
                if (c == '>') {
                    insideTag = false;
                }
            } else {
                text.append(c);
            }
        }
        return normalize(text.toString());
    }

    /** Replaces tabs, line feeds, U+001E and U+001F by spaces so that a text fits on one output line. */
    private static String normalize(final String text) {
        return text.replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
    }

    /** Copies {@code in} to {@code out} until {@code in} is exhausted; neither stream is closed. */
    private static final void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = in.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
    }

}