highcharts 1.5 ExportController.java using servlet-api-2.5

Recently we integrated Moxie Group’s GWT Highcharts into our project and ran into a conflict: the Highcharts ExportController relies on the multi-part support of servlet-api-3.0.

As with older GWT releases, it uses and includes servlet-api-2.5, or rather Jetty 7. Fortunately, our project already has Apache Commons FileUpload among its dependencies, so we could use its FileItemIterator to read the multi-part items. The changes are in processrequest(HttpServletRequest request, HttpServletResponse response) and getParameter(HttpServletRequest request, String name). Here is the modified code:

package com.highcharts.export.controller;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.Enumeration;

import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.batik.transcoder.TranscoderException;
import org.apache.commons.fileupload.FileItemFactory;
import org.apache.commons.fileupload.FileItemIterator;
import org.apache.commons.fileupload.FileItemStream;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
import org.apache.log4j.Logger;
import org.mortbay.log.Log;

import com.highcharts.export.util.MimeType;
import com.highcharts.export.util.SVGRasterizer;
import com.highcharts.export.util.SVGRasterizerException;

public class ExportController extends HttpServlet
{
private static final long serialVersionUID = 1L;

private static final String REQUEST_METHOD_POST = "POST";

private static final String CONTENT_TYPE_MULTIPART = "multipart/";

private static final String FORBIDDEN_WORD = "<!ENTITY";

protected static Logger logger = Logger.getLogger("exportservlet");

public ExportController()
{
super();
}

public void init()
{
}

protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException
{
processrequest(request, response);
}

protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException
{
processrequest(request, response);
}

public void processrequest(HttpServletRequest request, HttpServletResponse response) throws IOException,
ServletException
{
String svg = getParameter(request, "svg");
String filename = getFilename(getParameter(request, "filename"));
Float width = getWidth(getParameter(request, "width"));
MimeType mime = getMime(getParameter(request, "type"));

try
{
boolean multi = isMultipartRequest(request);

if (multi)
{
try
{
final FileItemFactory factory = new DiskFileItemFactory();
final ServletFileUpload upload = new ServletFileUpload(factory);
final FileItemIterator filesIterator = upload.getItemIterator(request);
while (filesIterator.hasNext())
{
final FileItemStream item = filesIterator.next();
final String name = item.getFieldName();
if ("svg".equals(name))
{
svg = new String(toByteArray(item), "UTF-8");
} else if ("filename".equals(name))
{
filename = getFilename(new String(toByteArray(item), "UTF-8"));
} else if ("width".equals(name))
{
width = getWidth(new String(toByteArray(item), "UTF-8"));
} else if ("type".equals(name))
{
mime = getMime(new String(toByteArray(item), "UTF-8"));
}
}
} catch (Exception e)
{
Log.warn("Failed to parse multi-part parameter: " + e);
}
} else
{

}
if (svg == null || svg.isEmpty())
{
throw new ServletException("The required - svg - post parameter is missing");
}
if (svg.indexOf(FORBIDDEN_WORD) > -1 || svg.indexOf(FORBIDDEN_WORD.toLowerCase()) > -1)
{
throw new ServletException("The - svg - post parameter could contain a malicious attack");
}

ExportController.writeFileContentToHttpResponse(svg, filename, width, mime, response);

} catch (IOException ioe)
{
logger.error("Oops something happened here redirect to error-page, " + ioe.getMessage());
sendError(request, response, ioe);
} catch (ServletException sce)
{
logger.error("Oops something happened here redirect to error-page, " + sce.getMessage());
sendError(request, response, sce);
}
}

public static byte[] toByteArray(final FileItemStream item) throws IOException
{
final InputStream in = item.openStream();
final ByteArrayOutputStream out = new ByteArrayOutputStream();
copy(in, out);
final byte[] byteArray = out.toByteArray();
in.close();
return byteArray;
}

public static void copy(final InputStream in, final OutputStream out) throws IOException
{
final byte[] buf = new byte[1024 * 8];
while (true)
{
final int length = in.read(buf);
if (length == -1)
{
break;
}
out.write(buf, 0, length);
}
out.flush();
}

/*
* Util methods
*/

public static void writeFileContentToHttpResponse(String svg, String filename, Float width, MimeType mime,
HttpServletResponse response) throws IOException, ServletException
{

ByteArrayOutputStream stream = new ByteArrayOutputStream();

if (!MimeType.SVG.equals(mime))
{
try
{
stream = SVGRasterizer.getInstance().transcode(stream, svg, mime, width);
} catch (SVGRasterizerException sre)
{
logger.error("Error while transcoding svg file to an image", sre);
stream.close();
throw new ServletException("Error while transcoding svg file to an image");
} catch (TranscoderException te)
{
logger.error("Error while transcoding svg file to an image", te);
stream.close();
throw new ServletException("Error while transcoding svg file to an image");
}
} else
{
stream.write(svg.getBytes());
}

// prepare response
response.reset();
response.setContentLength(stream.size());
response.setCharacterEncoding("utf-8");
response.setHeader("Content-disposition", "attachment; filename=" + filename + "." + mime.name().toLowerCase());
response.setHeader("Content-type", mime.getType());
// set encoding before writing to out, check this
ServletOutputStream out = response.getOutputStream();
// Send content to Browser
out.write(stream.toByteArray());
out.flush();
}

public static final boolean isMultipartRequest(HttpServletRequest request)
{
// inspired by org.apache.commons.fileupload
logger.debug("content-type " + request.getContentType());
return REQUEST_METHOD_POST.equalsIgnoreCase(request.getMethod()) && request.getContentType() != null
&& request.getContentType().toLowerCase().startsWith(CONTENT_TYPE_MULTIPART);
}

private String getParameter(HttpServletRequest request, String name) throws IOException, ServletException
{

return request.getParameter(name);

}

private String getFilename(String name)
{
return (name != null) ? name : "chart";
}

private static Float getWidth(String width)
{
if (width != null && !width.isEmpty())
{
Float parsedWidth = Float.valueOf(width);
if (parsedWidth.compareTo(0.0F) > 0)
{
return parsedWidth;
}
}
return null;
}

private static MimeType getMime(String mime)
{
MimeType type = MimeType.get(mime);
if (type != null)
{
return type;
}
return MimeType.PNG;
}

protected void sendError(HttpServletRequest request, HttpServletResponse response, Throwable ex) throws IOException,
ServletException
{
String headers = null;
String htmlHeader =
"<HTML><HEAD><TITLE>Highcharts Export error</TITLE><style type=\"text/css\">"
+ "body {font-family: \"Trebuchet MS\", Arial, Helvetica, sans-serif;} table {border-collapse: collapse;}th {background-color:green;color:white;} td, th {border: 1px solid #98BF21;} </style></HEAD><BODY>";
String htmlFooter = "</BODY></HTML>";

response.setContentType("text/html");

PrintWriter out = response.getWriter();
Enumeration<String> e = request.getHeaderNames();
String svg = this.getParameter(request, "svg");

out.println(htmlHeader);
out.println("<h3>Error while converting SVG</h3>");
out.println("<h4>Error message</h4>");
out.println("<p>" + ex.getMessage() + "</p>");
out.println("<h4>Debug steps</h4><ol>"
+ "<li>Copy the SVG:<br/><textarea cols=100 rows=5>"
+ svg
+ "</textarea></li>"
+ "<li>Go to <a href='http://validator.w3.org/#validate_by_input' target='_blank'>validator.w3.org/#validate_by_input</a></li>"
+ "<li>Paste the SVG</li>" + "<li>Click More Options and select SVG 1.1 for Use Doctype</li>"
+ "<li>Click the Check button</li></ol>");

out.println("<h4>Request Headers</h4>");
out.println("<TABLE>");
out.println("<tr><th> Header </th><th> Value </th>");

while (e.hasMoreElements())
{
headers = (String) e.nextElement();
if (headers != null)
{
out.println("<tr><td align=center><b>" + headers + "</td>");
out.println("<td align=center>" + request.getHeader(headers) + "</td></tr>");
}
}
out.println("</TABLE><BR>");
out.println(htmlFooter);

}
}
<pre>

travelmix – Ajax/JSON-RPC-based realtime on map public transport routing using EFA

应用EFA实现在线即时地图导航

(EFA是大部分德国公共交通公司使用的导航时刻表系统)

地图使用CloudMade(采用OpenStreetMap (OSM) 地图)。

Ajax/JSON-RPC-based realtime on map public transport routing using EFA

例图:

1. 德国曼海姆,海德堡,卡鲁(Germany – Mannheim, Heidelberg, Karlsruhe)

2. 德国弗莱堡(Germany – Freiburg)

3. 德国曼海姆市中心 (Germany – Mannheim city)

4. 奥地利Linz (Austria – Linz)

5. 英国伦敦 (UK – London)

6. 瑞士苏黎世 (Switzerland – Zurich)

Demo安装&使用方法

打开travelmix-service.exe (或java -jar travelmix-service.jar)

用浏览器打开travelmix-app里的index.html文件

注:travelmix-service初始端口为9080,电脑必须连接网络

下载(download)

travelmix后台服务Windows: http://travelmix.googlecode.com/files/travelmix-service-1.0.exe

travelmix后台服务Java Jar(与windows版相同): http://travelmix.googlecode.com/files/travelmix-service-1.0.jar

travelmix前端程序:http://travelmix.googlecode.com/files/travelmix-app-1.0.zip

结构演示

e下 1.7 (exia) – e-hentai.org批量下载器

e下1.7下载:e-hentai.org批量下载工具 (e-hentai.org galleries downloader)

exia 1.7版本更新(2012-10-11):

1. 更正漫画名称乱码 (更好的支持HTML字符)
2. 增加下载稳定度 (续下漫画册,错误自动等待,重新下载等,更正下载地址含特殊符号的错误)
3. 探测并重新下载错误图片

 

e-hentai.org是现今最大的”假”p2p漫画分享网站,一个很好的同人网站。。上面的漫画无奇不有,除了最多的英文日文外还有非常多的中文漫画册。值得推荐。

  • 支持代理设置
  • 支持Cookie, User Agent等设置
  • 支持批量下载搜索结果
  • 智能执行错误分析
  • 注:由于服务器下载速度受限制。如需要下载多个文档,请同时开启多个”e下”程序然后设置不同代理再下载。(未来的版本将自动更换代理。)

下载:

* Windows: [http://exia.googlecode.com/files/exia-1.7.exe]

* Linux, Mac etc.: [http://exia.googlecode.com/files/exia-1.7.jar]

使用方法:

例:

1. 关键词搜索批量下载:如”chinese”

–> 下载所有包含此关键词的漫画册(>400部)

2. 通过输入漫画册网址下载整部漫画册:http://g.e-hentai.org/g/494953/7c3ec35c08/

–> 下载整本漫画

3. 输入漫画图网址下载:http://g.e-hentai.org/s/14b9c859ed/493328-1

–> 下载从本页开始所有的漫画

源文件: https://code.google.com/p/exia/source/browse/#git%2Fsrc%2Fcn%2Fkk%2Fexia

从jar里提取资源

使用Java时经常会碰到需要从jar文件里提取资源的情况。Java里提供了getClass().getResource()和getResourceAsStream()这两个函数来读取jar里的文件。但有时候如果不是直接使用该资源,而是需要把资源文件的名称和位置传递给第三方,就不太方便了。下面这个帮助函数把资源文件先提取出来,储存在一个临时文件里以便继续加工。getResourceFile返回的File为该资源的临时文件:

如从jar里提取根目录下的cfg.properties文件:File cfgFile = getResourceFile(“/cfg.properties”);

/**
 * Copies a classpath resource into a newly created temporary file.
 *
 * @param resourceFile the resource name as accepted by {@code Class.getResourceAsStream}
 * @return the temporary file holding the resource content, or {@code null} when the resource does not exist
 *         or cannot be copied
 */
public final File getResourceFile(String resourceFile) {
    File tmpFile = null;
    InputStream in = null;
    try {
        in = getClass().getResourceAsStream(resourceFile);
        if (in == null) {
            // missing resource: report it without creating a stray temp file
            System.err.println("Resource not found: " + resourceFile);
            return null;
        }
        tmpFile = File.createTempFile("resource", null);
        writeStream(in, tmpFile);
        return tmpFile;
    } catch (Exception e) {
        e.printStackTrace();
        if (tmpFile != null) {
            // do not leave a half-written temp file behind
            tmpFile.delete();
        }
        return null;
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // nothing sensible left to do when closing a fully consumed resource stream fails
            }
        }
    }
}

private static final void writeStream(InputStream in, File file) throws IOException {
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
int len;
byte[] buffer = new byte[1024 * 8];
while ((len = in.read(buffer)) &gt; 0) {
out.write(buffer, 0, len);
}
out.close();
}

boolean array / BitSet与String之间的转换

在安卓里,shared preferences只能储存简单类型(如int, boolean, string等)。其它的类型要么需要储存到数据库或其它缓存里,要么就要先转换成shared preferences所支持的类型。

这里我们将boolean array或BitSet储存为String,然后再转换回来。

    /**
     * Packs the set bits of a BitSet into a byte array. Bit {@code n} ends up in bit {@code n % 8} of the byte
     * at index {@code length - 1 - n / 8}, i.e. bit 0 lives in the last byte.
     *
     * @param bs the bits to pack
     * @return {@code (bs.length() + 7) / 8} bytes; empty for an empty BitSet
     */
    public final static byte[] toByteArray(final BitSet bs) {
        final byte[] packed = new byte[(bs.length() + 7) / 8];
        final int lastByte = packed.length - 1;
        int bit = bs.nextSetBit(0);
        while (bit >= 0) {
            packed[lastByte - (bit >> 3)] |= 1 << (bit & 7);
            bit = bs.nextSetBit(bit + 1);
        }
        return packed;
    }

    /**
     * Rebuilds a BitSet from a byte array in which bit 0 is the lowest bit of the last byte.
     *
     * @param bytes the packed bits
     * @return a BitSet with one bit set for every 1-bit in {@code bytes}
     */
    public final static BitSet fromByteArray(final byte[] bytes) {
        final BitSet result = new BitSet(bytes.length * 8);
        int base = 0;
        for (int byteIdx = bytes.length - 1; byteIdx >= 0; byteIdx--) {
            final int value = bytes[byteIdx];
            for (int bit = 0; bit < 8; bit++) {
                if ((value & (1 << bit)) != 0) {
                    result.set(base + bit);
                }
            }
            base += 8;
        }
        return result;
    }

    /**
     * Encodes a BitSet as a String by packing it with {@code toByteArray} and decoding the bytes as ISO-8859-1.
     *
     * @param bs the bits to encode
     * @return the encoded String
     */
    public final static String toString(final BitSet bs) {
        final Charset latin1 = Charset.forName("ISO-8859-1");
        final byte[] packed = toByteArray(bs);
        return new String(packed, latin1);
    }

    /**
     * Decodes a String by encoding it as ISO-8859-1 bytes and unpacking them with {@code fromByteArray}.
     *
     * @param data the encoded String
     * @return the decoded BitSet
     */
    public final static BitSet fromString(final String data) {
        final Charset latin1 = Charset.forName("ISO-8859-1");
        final byte[] packed = data.getBytes(latin1);
        return fromByteArray(packed);
    }

在Activity里面就很方便了:

    /** Stores the current bits as a String under the "BITS" preference key. */
    @Override
    protected void onPause() {
        super.onPause();
        final String encodedBits = toString(this.bs);
        getPreferences(Activity.MODE_PRIVATE).edit().putString("BITS", encodedBits).commit();
    }

    /** Restores the bits from the "BITS" preference key; a missing entry is read as "". */
    @Override
    protected void onResume() {
        super.onResume();
        final String encodedBits = getPreferences(Activity.MODE_PRIVATE).getString("BITS", "");
        this.bs = fromString(encodedBits);
    }

测试:

    /**
     * Round-trip check: fills a BitSet with random bits among the first 1000 positions, converts it to a String
     * and back, and prints whether the result equals the original.
     */
    static public void main(String[] args) throws IOException, AWTException, SecurityException, NoSuchFieldException,
            IllegalArgumentException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        final BitSet original = new BitSet();
        int position = 0;
        while (position < 1000) {
            if (Math.random() < 0.5) {
                original.set(position);
            }
            position++;
        }

        final BitSet restored = fromString(toString(original));
        System.out.println(restored.equals(original));
    }

把boolean[]转换成BitSet:(boolean[] length必须另外存储进shared preferences里)
(注:推荐使用BitSet。在java里每个boolean都占用一个byte的位置。这样当boolean[]很大时,就很浪费内存了。在手机,平板电脑上内存可是很珍贵的。):

    /**
     * Expands a BitSet into a boolean array of the given length.
     *
     * @param bs the bits to expand
     * @param length the size of the returned array
     * @return an array whose element {@code i} is {@code true} when bit {@code i} is set; bits at or beyond
     *         {@code length} are ignored
     */
    public static final boolean[] fromBitSet(final BitSet bs, final int length) {
        final boolean[] b = new boolean[length];
        // stop at the array bound: a BitSet holding more bits than requested must not overflow the array
        for (int idx = bs.nextSetBit(0); idx >= 0 && idx < length; idx = bs.nextSetBit(idx + 1)) {
            b[idx] = true;
        }
        return b;
    }

    /**
     * Converts a boolean array into a BitSet with bit {@code i} set wherever {@code b[i]} is {@code true}.
     *
     * @param b the flags to convert
     * @return the resulting BitSet
     */
    public static final BitSet toBitSet(final boolean[] b) {
        final BitSet bits = new BitSet(b.length);
        int position = 0;
        for (final boolean flag : b) {
            if (flag) {
                bits.set(position);
            }
            position++;
        }
        return bits;
    }

QQ火拼俄罗斯方块外挂(QQTetris bot with java source)

下载

Windows:http://xytetrisbot.googlecode.com/files/xytetris-1.3-fast.exe

快捷键dll(32bit):http://xytetrisbot.googlecode.com/files/JIntellitype32.dll

需改名为JIntellitype.dll并放在与xytetris.exe同个文件夹里。

快捷键dll(64bit):http://xytetrisbot.googlecode.com/files/JIntellitype.dll

玩了几圈QQ火拼俄罗斯方块。一时来兴编了个机器人程序。算法比较死板,算得分,然后采用得分最高的步骤。

运算得分的计数值经过初步预算(simulator)。每种方案(normal,long life,等)平均算了1天左右。所以还不是最理想的。如有兴趣可以试着修改一下。

游戏块,道具等是通过截屏分析得出的。我没有花更多的时间分析QQTetris的内存结构或网络流。相信通过读出内存可以更快的预知数据的。(QQTetris像是会预读20左右的游戏块和道具在内存里。)

xytetrisbot xytetris

 

源代码:https://code.google.com/p/xytetrisbot/

Java:在Windows里实现快速截图

在Java里使用java.awt.Robot来屏幕截图非常的慢。直接使用RobotPeer或者native JNI的函数能数倍的提高速度,实现即时截图。

  • RobotPeer可以通过Toolkit直接生成。
  • WRobotPeer里“private native getRGBPixels”的应用是通过reflection实现的。

下面是测试结果:

// 使用Robot
Robot.getPixelColor(1024 * 768): 3850 ms
Robot.createScreenCapture(1024 * 768): 19 ms

// 使用RobotPeer
RobotPeer.getRGBPixel(1024 * 768): 3686 ms
RobotPeer.getRGBPixels(1024 * 768): 10 ms

// 使用RobotPeer.getRGBPixels(int x, int y, int w, int h, int[] buffer) (native)
RobotPeer.getRGBPixels(1024 * 768, buffer): 7 ms

测试代码:

//
// Using Robot
//
final Robot robot = new Robot();
long start = System.currentTimeMillis();
int x = 0;
int y = 0;
for (int i = 0; i < 1024 * 768; i++) {
    robot.getPixelColor(x++, y);
    if (x == 1024) {
        // wrap to the start of the next row so every sample stays inside the 1024 x 768 area
        x = 0;
        y++;
    }
}
System.out.println("Robot.getPixelColor(1024 * 768): " + (System.currentTimeMillis() - start) + " ms");
start = System.currentTimeMillis();
robot.createScreenCapture(new Rectangle(0, 0, 1024, 768));
System.out.println("Robot.createScreenCapture(1024 * 768): " + (System.currentTimeMillis() - start) + " ms");

//
// Using RobotPeer
//
final RobotPeer peer = ((ComponentFactory) Toolkit.getDefaultToolkit()).createRobot(null, null);
// start again at the top-left corner so both per-pixel loops read the same pixels
x = 0;
y = 0;
start = System.currentTimeMillis();
for (int i = 0; i < 1024 * 768; i++) {
    peer.getRGBPixel(x++, y);
    if (x == 1024) {
        x = 0;
        y++;
    }
}
System.out.println("RobotPeer.getRGBPixel(1024 * 768): " + (System.currentTimeMillis() - start) + " ms");
start = System.currentTimeMillis();
peer.getRGBPixels(new Rectangle(0, 0, 1024, 768));
System.out.println("RobotPeer.getRGBPixels(1024 * 768): " + (System.currentTimeMillis() - start) + " ms");

//
// Using RobotPeer.getRGBPixels(int x, int y, int w, int h, int[] buffer) (native)
//
final Class<?>[] params = new Class<?>[] { int.class, int.class, int.class, int.class, int[].class };
// the buffer-filling overload is not public on the peer, so it is looked up and invoked reflectively
final Method getRGBPixelsMethod = peer.getClass().getDeclaredMethod("getRGBPixels", params);
getRGBPixelsMethod.setAccessible(true);
final int[] buffer = new int[1024 * 768];
start = System.currentTimeMillis();
getRGBPixelsMethod.invoke(peer, 0, 0, 1024, 768, buffer);
System.out.println("RobotPeer.getRGBPixels(1024 * 768, buffer): " + (System.currentTimeMillis() - start) + " ms");

如果是纯粹想在自己的电脑上提升速度。也不妨试一下binary weaving。就是覆盖rt.jar里的WRobotPeer.java文件。

测试结果:

Robot.getPixelColor(1024 * 768): 3446 ms
Robot.createScreenCapture(1024 * 768): 23 ms
RobotPeer.getRGBPixel(1024 * 768): 3387 ms
RobotPeer.getRGBPixels(1024 * 768): 10 ms
RobotPeer.getRGBPixels(1024 * 768, buffer): 8 ms
RobotPeer.getRGBPixels(1024 * 768, buffer) direct: 7 ms

WRobotPeer.java文件:

package sun.awt.windows;

import java.awt.Rectangle;
import java.awt.peer.RobotPeer;

/**
 * Variant of the Windows robot peer in which the buffer-filling
 * {@link #getRGBPixels(int, int, int, int, int[])} method is declared public. Most operations are declared
 * native; their implementations are not part of this file.
 */
public class WRobotPeer extends WObjectPeer
        implements RobotPeer {
    /** Creates the peer by calling the native {@link #create()}. */
    public WRobotPeer() {
        create();
    }

    private synchronized native void _dispose();

    /** Delegates disposal to the native {@code _dispose()}. */
    protected void disposeImpl() {
        _dispose();
    }

    public native void create();

    public native void mouseMoveImpl(int x, int y);

    /** Forwards to {@link #mouseMoveImpl(int, int)}. */
    public void mouseMove(int x, int y) {
        mouseMoveImpl(x, y);
    }

    public native void mousePress(int buttons);

    public native void mouseRelease(int buttons);

    public native void mouseWheel(int wheelAmt);

    public native void keyPress(int keycode);

    public native void keyRelease(int keycode);

    /** Forwards to {@link #getRGBPixelImpl(int, int)}. */
    public int getRGBPixel(int x, int y) {
        return getRGBPixelImpl(x, y);
    }

    public native int getRGBPixelImpl(int x, int y);

    /**
     * Allocates an array of {@code bounds.width * bounds.height} ints and passes it, together with the
     * rectangle's position and size, to the native buffer-filling overload.
     *
     * @param bounds the screen area to read
     * @return the array handed to the native call
     */
    public int[] getRGBPixels(Rectangle bounds) {
        final int[] pixels = new int[bounds.width * bounds.height];
        getRGBPixels(bounds.x, bounds.y, bounds.width, bounds.height, pixels);
        return pixels;
    }

    public native void getRGBPixels(int x, int y, int w, int h, int[] buffer);
}

搜狗sgim_core.bin文件解读(java)

源程序:https://dict4cn.googlecode.com/svn/trunk/importer/src/SogouSgimCoreBinReader.java

 

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;

/**
 * Sogou sgim_core.bin reader.
 * <p>
 * File layout as used by this reader:
 * <ul>
 * <li>0x0C: number of words</li>
 * <li>????: for each word, its length in bytes followed by the word itself (encoding: UTF-16LE)</li>
 * </ul>
 * For files like sgim_eng.bin etc., the implementation has to be slightly modified.
 *
 * @author keke
 */
public class SogouSgimCoreBinReader {
    /**
     * Counts the words of a bin file and prints where the word list starts and ends.
     *
     * @param args optional: the path of the bin file; defaults to {@code D:\sgim_core.bin}
     * @throws IOException if the file cannot be read or is too short to contain the word count
     */
    public static void main(String[] args) throws IOException {
        String binFile = (args != null && args.length > 0) ? args[0] : "D:\\sgim_core.bin";
        // String binFile = "D:\\sgim_eng.bin";

        // read the whole bin file into a byte buffer
        final ByteBuffer bb;
        final RandomAccessFile file = new RandomAccessFile(binFile, "r");
        try {
            final FileChannel fChannel = file.getChannel();
            bb = ByteBuffer.allocate((int) fChannel.size());
            // a single read() is not guaranteed to fill the buffer
            while (bb.hasRemaining() && fChannel.read(bb) != -1) {
                // keep reading
            }
        } finally {
            file.close();
        }
        bb.order(ByteOrder.LITTLE_ENDIAN);
        bb.flip();

        if (bb.limit() < 0xC + 4) {
            throw new IOException("File too short to contain a word count: " + binFile);
        }
        int words = bb.getInt(0xC);
        System.out.println("读入文件: " + binFile + ",单词:" + words);

        int i;
        int startPos = -1;
        // scan in 4-byte steps for the marker; stop before a partial int at the end of the file
        while (bb.remaining() >= 4) {
            i = bb.getInt();
            if (i == 0x554a0002) { // core, 6.1.0.6700
                // if (i == 0x00610002) { // eng, 6.1.0.6700
                startPos = bb.position() - 4;
                break;
            }
        }

        if (startPos > -1) {
            short s;
            int counter = 0;
            ByteBuffer buffer = ByteBuffer.allocate(Short.MAX_VALUE);
            System.out.println("单词起始位置:0x" + Integer.toHexString(startPos));
            bb.position(startPos);
            while (words-- > 0 && bb.remaining() >= 2) {
                s = bb.getShort();
                if (s < 0 || s > bb.remaining()) {
                    // corrupt or truncated entry: stop instead of failing with a buffer exception
                    System.err.println("Invalid word length " + s + " at 0x"
                            + Integer.toHexString(bb.position() - 2));
                    break;
                }
                bb.get(buffer.array(), 0, s);
                counter++;
                // System.out.println(new String(buffer.array(), 0, s, "UTF-16LE"));
            }
            int endPos = bb.position();
            int diff = endPos - startPos;
            System.out.println("读出单词'" + binFile + "':" + counter);
            System.out.println("单词结尾位置:0x" + Integer.toHexString(endPos));
            System.out.println("单词词典长度:0x" + Integer.toHexString(diff));
        }
    }
}

Lingoes灵格斯电子词典LD2(LDF)文件解析(附java词典导出程序)

新 Lingoes灵格斯电子词典LD2(LDF)文件单词提取器

http://code.google.com/p/lingoes-extractor/

下载

1. Windows版: http://lingoes-extractor.googlecode.com/files/lingoes-extractor-1.0.exe

2. Java版:http://lingoes-extractor.googlecode.com/files/lingoes-extractor-1.0.jar

程序演示

选择LD2文件跟导出文件:

导出后的文件:

 

 

 

支持已知所有Lingoes词典版本(2.x)。自动导出索引组(*.idx),所有词组(*.words),翻译(*.output)文件等。

Lingoes Reader / Exporter源程序下载https://dict4cn.googlecode.com/svn/trunk/importer/src/LingoesLd2Reader.java

源文件:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

/**
 * Lingoes LD2/LDF File Reader
 * 
 * <pre>
 * Lingoes Format overview:
 * 
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers are using little endian byte order.
 * - Definitions and xml data have UTF-8 or UTF-16LE encodings.
 * 
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary) 
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 * 
 * TODO: find encoding / language fields to replace auto-detect of encodings
 * 
 * </pre>
 * 
 * @author keke
 * 
 */
public class LingoesLd2Reader {
    private static final String[] AVAIL_ENCODINGS = { "UTF-8", "UTF-16LE", "UTF-16BE" };

    public static void main(String[] args) throws IOException {
        // download from
        // https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
        String ld2File = "X:\\kkdict\\dicts\\lingoes\\Prodic English-Vietnamese Business.ld2";

        // read lingoes ld2 into byte array
        ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
        FileChannel fChannel = new RandomAccessFile(ld2File, "r").getChannel();
        fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
        fChannel.close();

        // as bytes
        ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
        dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

        System.out.println("文件:" + ld2File);
        System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
        System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
        System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

        int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
        if (dataRawBytes.limit() > offsetData) {
            System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
            int type = dataRawBytes.getInt(offsetData);
            System.out.println("简介类型:0x" + Integer.toHexString(type));
            int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
            if (type == 3) {
                // without additional information
                readDictionary(ld2File, dataRawBytes, offsetData);
            } else if (dataRawBytes.limit() > offsetWithInfo - 0x1C) {
                readDictionary(ld2File, dataRawBytes, offsetWithInfo);
            } else {
                System.err.println("文件不包含字典数据。网上字典?");
            }
        } else {
            System.err.println("文件不包含字典数据。网上字典?");
        }
    }

    private static final long decompress(final String inflatedFile, final ByteBuffer data, final int offset,
            final int length, final boolean append) throws IOException {
        Inflater inflator = new Inflater();
        InflaterInputStream in = new InflaterInputStream(new ByteArrayInputStream(data.array(), offset, length),
                inflator, 1024 * 8);
        FileOutputStream out = new FileOutputStream(inflatedFile, append);
        writeInputStream(in, out);
        long bytesRead = inflator.getBytesRead();
        in.close();
        out.close();
        inflator.end();
        return bytesRead;
    }

    private static final String[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int defTotal, final int dataLen, final int[] idxData, final String[] defData)
            throws UnsupportedEncodingException {
        final int tests = Math.min(defTotal, 10);
        int defEnc = 0;
        int xmlEnc = 0;
        Pattern p = Pattern.compile("^.*[\\x00-\\x1f].*$");
        for (int i = 0; i < tests; i++) {
            readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen, AVAIL_ENCODINGS[defEnc],
                    AVAIL_ENCODINGS[xmlEnc], idxData, defData, i);
            if (p.matcher(defData[0]).matches()) {
                if (defEnc < AVAIL_ENCODINGS.length - 1) {
                    defEnc++;
                }
                i = 0;
            }
            if (p.matcher(defData[1]).matches()) {
                if (xmlEnc < AVAIL_ENCODINGS.length - 1) {
                    xmlEnc++;
                }
                i = 0;
            }
        }
        System.out.println("词组编码:" + AVAIL_ENCODINGS[defEnc]);
        System.out.println("XML编码:" + AVAIL_ENCODINGS[xmlEnc]);
        return new String[] { AVAIL_ENCODINGS[defEnc], AVAIL_ENCODINGS[xmlEnc] };
    }

    private static final void extract(final String inflatedFile, final String indexFile,
            final String extractedWordsFile, final String extractedXmlFile, final String extractedOutputFile,
            final int[] idxArray, final int offsetDefs, final int offsetXml) throws IOException, FileNotFoundException,
            UnsupportedEncodingException {
        System.out.println("写入'" + extractedOutputFile + "'。。。");

        FileWriter indexWriter = new FileWriter(indexFile);
        FileWriter defsWriter = new FileWriter(extractedWordsFile);
        FileWriter xmlWriter = new FileWriter(extractedXmlFile);
        FileWriter outputWriter = new FileWriter(extractedOutputFile);
        // read inflated data
        ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
        FileChannel fChannel = new RandomAccessFile(inflatedFile, "r").getChannel();
        fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
        fChannel.close();
        ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
        dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

        final int dataLen = 10;
        final int defTotal = offsetDefs / dataLen - 1;

        String[] words = new String[defTotal];
        int[] idxData = new int[6];
        String[] defData = new String[2];

        final String[] encodings = detectEncodings(dataRawBytes, offsetDefs, offsetXml, defTotal, dataLen, idxData,
                defData);

        dataRawBytes.position(8);
        int counter = 0;
        final String defEncoding = encodings[0];
        final String xmlEncoding = encodings[1];
        for (int i = 0; i < defTotal; i++) {
            readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, defEncoding, xmlEncoding, idxData,
                    defData, i);

            words[i] = defData[0];
            defsWriter.write(defData[0]);
            defsWriter.write("\n");

            xmlWriter.write(defData[1]);
            xmlWriter.write("\n");

            outputWriter.write(defData[0]);
            outputWriter.write("=");
            outputWriter.write(defData[1]);
            outputWriter.write("\n");

            System.out.println(defData[0] + " = " + defData[1]);
            counter++;
        }

        for (int i = 0; i < idxArray.length; i++) {
            int idx = idxArray[i];
            indexWriter.write(words[idx]);
            indexWriter.write(", ");
            indexWriter.write(String.valueOf(idx));
            indexWriter.write("\n");
        }
        indexWriter.close();
        defsWriter.close();
        xmlWriter.close();
        outputWriter.close();
        System.out.println("成功读出" + counter + "组数据。");
    }

    private static final void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
        dataRawBytes.position(position);
        wordIdxData[0] = dataRawBytes.getInt();
        wordIdxData[1] = dataRawBytes.getInt();
        wordIdxData[2] = dataRawBytes.get() & 0xff;
        wordIdxData[3] = dataRawBytes.get() & 0xff;
        wordIdxData[4] = dataRawBytes.getInt();
        wordIdxData[5] = dataRawBytes.getInt();
    }

    private static final void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams,
            final String inflatedFile) {
        System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
        int startOffset = dataRawBytes.position();
        int offset = -1;
        int lastOffset = startOffset;
        boolean append = false;
        try {
            for (Integer offsetRelative : deflateStreams) {
                offset = startOffset + offsetRelative.intValue();
                decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
                append = true;
                lastOffset = offset;
            }
        } catch (Throwable e) {
            System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
        }
    }

    private static final void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int dataLen, final String wordEncoding, final String xmlEncoding,
            final int[] idxData, final String[] defData, final int i) throws UnsupportedEncodingException {
        getIdxData(inflatedBytes, dataLen * i, idxData);
        int lastWordPos = idxData[0];
        int lastXmlPos = idxData[1];
        final int flags = idxData[2];
        int refs = idxData[3];
        int currentWordOffset = idxData[4];
        int currenXmlOffset = idxData[5];
        String xml = strip(new String(inflatedBytes.array(), offsetXml + lastXmlPos, currenXmlOffset - lastXmlPos,
                xmlEncoding));
        while (refs-- > 0) {
            int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
            getIdxData(inflatedBytes, dataLen * ref, idxData);
            lastXmlPos = idxData[1];
            currenXmlOffset = idxData[5];
            if (xml.isEmpty()) {
                xml = strip(new String(inflatedBytes.array(), offsetXml + lastXmlPos, currenXmlOffset - lastXmlPos,
                        xmlEncoding));
            } else {
                xml = strip(new String(inflatedBytes.array(), offsetXml + lastXmlPos, currenXmlOffset - lastXmlPos,
                        xmlEncoding)) + ", " + xml;
            }
            lastWordPos += 4;
        }
        defData[1] = xml;

        String word = new String(inflatedBytes.array(), offsetWords + lastWordPos, currentWordOffset - lastWordPos,
                wordEncoding);
        defData[0] = word;
    }

    private static final void readDictionary(final String ld2File, final ByteBuffer dataRawBytes,
            final int offsetWithIndex) throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
        int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
        int offsetIndex = offsetWithIndex + 0x1C;
        int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
        int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
        int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
        int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
        int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;
        List<Integer> deflateStreams = new ArrayList<Integer>();
        dataRawBytes.position(offsetCompressedDataHeader + 8);
        int offset = dataRawBytes.getInt();
        while (offset + dataRawBytes.position() < limit) {
            offset = dataRawBytes.getInt();
            deflateStreams.add(Integer.valueOf(offset));
        }
        int offsetCompressedData = dataRawBytes.position();
        System.out.println("索引词组数目:" + definitions);
        System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / "
                + (offsetCompressedDataHeader - offsetIndex) + " B");
        System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / "
                + (limit - offsetCompressedData) + " B");
        System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
        System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength) + " / "
                + inflatedWordsLength + " B");
        System.out.println("XML地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength)
                + " / " + inflatedXmlLength + " B");
        System.out.println("文件大小(解压缩后):" + (inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024
                + " KB");
        String inflatedFile = ld2File + ".inflated";
        inflate(dataRawBytes, deflateStreams, inflatedFile);

        if (new File(inflatedFile).isFile()) {
            String indexFile = ld2File + ".idx";
            String extractedFile = ld2File + ".words";
            String extractedXmlFile = ld2File + ".xml";
            String extractedOutputFile = ld2File + ".output";

            dataRawBytes.position(offsetIndex);
            int[] idxArray = new int[definitions];
            for (int i = 0; i < definitions; i++) {
                idxArray[i] = dataRawBytes.getInt();
            }
            extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile, idxArray,
                    inflatedWordsIndexLength, inflatedWordsIndexLength + inflatedWordsLength);
        }
    }

    private static final String strip(final String xml) {
        int open = 0;
        int end = 0;
        if ((open = xml.indexOf("<![CDATA[")) != -1) {
            if ((end = xml.indexOf("]]>", open)) != -1) {
                return xml.substring(open + "<![CDATA[".length(), end).replace('\t', ' ').replace('\n', ' ')
                        .replace('\u001e', ' ').replace('\u001f', ' ');
            }
        } else if ((open = xml.indexOf("<Ô")) != -1) {
            if ((end = xml.indexOf("</Ô", open)) != -1) {
                open = xml.indexOf(">", open + 1);
                return xml.substring(open + 1, end).replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ')
                        .replace('\u001f', ' ');
            }
        } else {
            StringBuilder sb = new StringBuilder();
            end = 0;
            open = xml.indexOf('<');
            do {
                if (open - end > 1) {
                    sb.append(xml.substring(end + 1, open));
                }
                open = xml.indexOf('<', open + 1);
                end = xml.indexOf('>', end + 1);
            } while (open != -1 && end != -1);
            return sb.toString().replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
        }
        return "";
    }

    private static final void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
        byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = in.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
    }

}

搜狗拼音输入法SCEL词库文件解析(附Java词库导出程序)

源程序下载:https://code.google.com/p/dict4cn/source/browse/trunk/importer/src/SogouScelReader.java

Source Code:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;

/**
 * Sogou Pinyin IME SCEL File Reader
 * 
 * SCEL Format overview:
 * 
 * General Information:
 * - Chinese characters and pinyin are all encoded with UTF-16LE.
 * - Numbers are using little endian byte order.
 * 
 * SCEL hex analysis:
 * - 0x0                     Pinyin List Offset
 * - 0x120                   total number of words
 * - [Pinyin List Offset]    total number of pinyin
 * - ...                     List of pinyin as [index, byte length of pinyin, pinyin as string] triples
 * - ...                     Dictionary
 * - ...           
 * 
 * Dictionary format:
 * - It can be interpreted as a list of 
 *   [alternatives of words, 
 *       byte length of pinyin indexes, pinyin indexes, 
 *       [byte length of word, word as string, length of skip bytes, skip bytes]
 *       ... (alternatives) 
 *   ].
 * 
 * 
 * @author keke
 */
public class SogouScelReader {
    /** File that is read when no path is given on the command line. */
    private static final String DEFAULT_SCEL_FILE = "D:\\test.scel";

    /** Charset name of every string stored in the file. */
    private static final String ENCODING = "UTF-16LE";

    /**
     * Prints every dictionary entry of a SCEL file as {@code words<TAB>pinyin}, followed by the
     * number of entries. Alternative words of one entry are joined with {@code ", "}, the pinyin
     * syllables with an apostrophe.
     *
     * @param args optional; {@code args[0]} is the path of the SCEL file, otherwise
     *             {@link #DEFAULT_SCEL_FILE} is used
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        // download from http://pinyin.sogou.com/dict
        String scelFile = (args != null && args.length > 0) ? args[0] : DEFAULT_SCEL_FILE;

        // read scel into byte array; readFully either fills the array or throws, and the file
        // is closed even when reading fails
        byte[] raw;
        RandomAccessFile file = new RandomAccessFile(scelFile, "r");
        try {
            raw = new byte[(int) file.length()];
            file.readFully(raw);
        } finally {
            file.close();
        }

        // scel as bytes
        ByteBuffer dataRawBytes = ByteBuffer.wrap(raw);
        dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

        String[] pyDict = new String[512];

        int totalWords = dataRawBytes.getInt(0x120);

        // pinyin offset: the first int of the file points to the pinyin list
        dataRawBytes.position(dataRawBytes.getInt());
        int totalPinyin = dataRawBytes.getInt();
        for (int i = 0; i < totalPinyin; i++) {
            int idx = readUnsignedShort(dataRawBytes);
            int len = readUnsignedShort(dataRawBytes);
            pyDict[idx] = readString(dataRawBytes, len);
        }

        // extract dictionary
        int counter = 0;
        for (int i = 0; i < totalWords; i++) {
            StringBuilder py = new StringBuilder();
            StringBuilder word = new StringBuilder();

            int alternatives = readUnsignedShort(dataRawBytes);
            // the stored value is a byte length; every pinyin index takes two bytes
            int pinyinIndexes = readUnsignedShort(dataRawBytes) / 2;
            boolean first = true;
            while (pinyinIndexes-- > 0) {
                int key = readUnsignedShort(dataRawBytes);
                if (first) {
                    first = false;
                } else {
                    py.append('\'');
                }
                py.append(pyDict[key]);
            }
            first = true;
            while (alternatives-- > 0) {
                if (first) {
                    first = false;
                } else {
                    word.append(", ");
                }
                int wordlength = readUnsignedShort(dataRawBytes);
                word.append(readString(dataRawBytes, wordlength));
                // skip bytes
                int skip = readUnsignedShort(dataRawBytes);
                dataRawBytes.position(dataRawBytes.position() + skip);
            }
            System.out.println(word.toString() + "\t" + py.toString());
            counter++;
        }
        System.out.println("\nExtracted '" + scelFile + "': " + counter);
    }

    /** Reads the next two bytes of the buffer as an unsigned 16-bit number. */
    private static int readUnsignedShort(ByteBuffer buffer) {
        return buffer.getShort() & 0xFFFF;
    }

    /** Reads the next {@code byteLength} bytes of the buffer and decodes them as UTF-16LE. */
    private static String readString(ByteBuffer buffer, int byteLength) throws IOException {
        // sized per call so that long entries cannot overflow a fixed scratch buffer
        byte[] bytes = new byte[byteLength];
        buffer.get(bytes);
        return new String(bytes, ENCODING);
    }
}