验证码自动识别

概要

最近朋友突然提出要做一个豆瓣自动回帖的功能。频繁顶帖超过三次就会出现验证码，这就涉及到了文字OCR，正好百度有个免费的OCR识别接口。说做就做：获取token、调用接口，结果发现识别正确率并不高。查了资料才知道，百度OCR只负责识别文字，并不会先对图片进行二值化和降噪。

二值化

验证码的识别有很多Python版本，但是Java版本很少，处理得比较好的就更少了。所以我们就仿照Python的逻辑写了个Java版本的二值化。

/**
 * Downloads a captcha image, binarises it, removes isolated noise pixels and
 * returns the result encoded as JPEG.
 *
 * <p>A pixel becomes white when any of its red, green or blue channels is above
 * the threshold (30); otherwise it becomes black. The binarised image is then
 * passed through {@code denoise}.
 *
 * <p>The method is {@code static} so that it can be invoked as
 * {@code BinaryImage.imageProcessing(url)}; calling it through an instance
 * still compiles.
 *
 * @param url address of the captcha image to download
 * @return the processed image as JPEG-encoded bytes
 * @throws IOException if the download fails, the response cannot be decoded as
 *                     an image, or no ImageIO writer accepts the result
 */
public static byte[] imageProcessing(String url) throws IOException {
	Connection.Response response = Jsoup.connect(url).timeout(5000)
			.header("Host", "www.douban.com")
			.header("Referer", "https://www.douban.com/")
			.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0/")
			.ignoreContentType(true)
			.execute();

	BufferedImage source;
	// ImageIO.read leaves the stream open, so it has to be closed here.
	try (BufferedInputStream in = response.bodyStream()) {
		source = ImageIO.read(in);
	}
	if (source == null) {
		// ImageIO.read returns null when no registered reader understands the data.
		throw new IOException("Response from " + url + " could not be decoded as an image");
	}

	int h = source.getHeight();
	int w = source.getWidth();
	BufferedImage binary = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);

	// Binarisation: anything that is not almost pure black is treated as background.
	int threshold = 30;
	int white = Color.WHITE.getRGB();
	int black = Color.BLACK.getRGB();
	for (int x = 0; x < w; x++) {
		for (int y = 0; y < h; y++) {
			int argb = source.getRGB(x, y);
			int r = (argb >> 16) & 0xff;
			int g = (argb >> 8) & 0xff;
			int b = argb & 0xff;
			boolean bright = r > threshold || g > threshold || b > threshold;
			binary.setRGB(x, y, bright ? white : black);
		}
	}

	// Noise removal
	BufferedImage cleaned = denoise(binary);
	ByteArrayOutputStream bos = new ByteArrayOutputStream();
	// ImageIO.write reports a missing writer through its return value, not an exception.
	if (!ImageIO.write(cleaned, "jpg", bos)) {
		throw new IOException("No ImageIO writer available for format jpg");
	}
	return bos.toByteArray();
}

降噪

降噪的作用其实不是很大，但为了提高识别率，我们还是加上了这一步。

 /**
 * Removes single-pixel noise: every black pixel whose neighbours are all white
 * is turned white. The image is modified in place and also returned.
 *
 * <p>Only neighbours that lie inside the image are considered, so border and
 * corner pixels are judged by the 5 or 3 neighbours they actually have, and
 * images of any size (including those narrower or shorter than 2 pixels) are
 * handled without index errors. A pixel with no in-bounds neighbour at all
 * (1x1 image) counts as isolated.
 *
 * <p>The in-place update does not depend on scan order: a pixel is cleared only
 * when none of its neighbours is black, so clearing it cannot change the
 * verdict for any other black pixel.
 *
 * <p>In practice the pass can be run repeatedly, or the unit can be widened to
 * several pixels.
 *
 * @param image image to clean; expected to be already binarised
 * @return the same {@code image} instance after cleaning
 */
public static BufferedImage denoise(BufferedImage image) {
	int w = image.getWidth();
	int h = image.getHeight();
	int white = Color.WHITE.getRGB();

	for (int y = 0; y < h; y++) {
		for (int x = 0; x < w; x++) {
			if (isBlack(image.getRGB(x, y)) && isSurroundedByWhite(image, x, y)) {
				image.setRGB(x, y, white);
			}
		}
	}
	return image;
}

/** Returns {@code true} when every in-bounds 8-neighbour of (x, y) is white. */
private static boolean isSurroundedByWhite(BufferedImage image, int x, int y) {
	int w = image.getWidth();
	int h = image.getHeight();
	for (int dy = -1; dy <= 1; dy++) {
		for (int dx = -1; dx <= 1; dx++) {
			if (dx == 0 && dy == 0) {
				continue; // the pixel itself is not its own neighbour
			}
			int nx = x + dx;
			int ny = y + dy;
			boolean inside = nx >= 0 && nx < w && ny >= 0 && ny < h;
			if (inside && !isWhite(image.getRGB(nx, ny))) {
				return false;
			}
		}
	}
	return true;
}

/**
 * Tells whether a packed RGB value is dark enough to count as black.
 *
 * @param colorInt packed RGB value; bits above the low 24 are ignored
 * @return {@code true} when red + green + blue is at most 300
 */
public static boolean isBlack(int colorInt) {
	int red = (colorInt >> 16) & 0xFF;
	int green = (colorInt >> 8) & 0xFF;
	int blue = colorInt & 0xFF;
	return red + green + blue <= 300;
}

/**
 * Tells whether a packed RGB value is bright enough to count as white.
 *
 * @param colorInt packed RGB value; bits above the low 24 are ignored
 * @return {@code true} when red + green + blue is greater than 300
 */
public static boolean isWhite(int colorInt) {
	int red = (colorInt >> 16) & 0xFF;
	int green = (colorInt >> 8) & 0xFF;
	int blue = colorInt & 0xFF;
	return red + green + blue > 300;
}

百度OCR

百度普通文本OCR识别，个人用户每天有50,000次免费调用，所以对于并发量不是很大的需求，这么多的免费次数还是够用的。

// Key of the cached access token; also used as the prefix of the per-day call counter key.
private static final String TOKEN_KEY = "TOKEN_KEY";
private static final String HTTPS_AIP_BAIDUBCE_COM = "https://aip.baidubce.com";
// Expiry counts from the moment the token is stored (expireAfterWrite). With
// expireAfterAccess a token that is read regularly would never leave the cache,
// even after the OCR service has stopped accepting it.
// NOTE(review): 2,591,000 s is presumably chosen to sit just under the token's
// 30-day lifetime — confirm against the expires_in value returned by the token endpoint.
private static final Cache<String, String> accessTokenCache = CacheBuilder.newBuilder().expireAfterWrite(2591000, TimeUnit.SECONDS).build();
// Call counters start at zero and are dropped 24 hours after they are created,
// no matter how often they are read in between.
private static final LoadingCache<String, AtomicLong> tokenCntCache = CacheBuilder.newBuilder().expireAfterWrite(24, TimeUnit.HOURS).build(new CacheLoader<String, AtomicLong>() {
	@Override
	public AtomicLong load(String s) throws Exception {
		return new AtomicLong(0);
	}
});


/**
 * Recognises the text of a captcha image through the general_basic OCR endpoint.
 *
 * <p>The image at {@code captchaUrl} is first pre-processed by
 * {@code BinaryImage.imageProcessing}, Base64-encoded and posted together with
 * the access token.
 *
 * <p>Important: the helper classes FileUtil, Base64Util, HttpUtil and GsonUtils
 * can be downloaded from
 * https://ai.baidu.com/file/658A35ABAB2D404FBF903F64D47C1F72
 * https://ai.baidu.com/file/C8D81F3301E24D2892968F09AE1AD6E2
 * https://ai.baidu.com/file/544D677F5D4E4F17B4122FBD60DB82B3
 * https://ai.baidu.com/file/470B3ACCA3FE43788B5A963BF0B625F3
 *
 * @param captchaUrl address of the captcha image
 * @return the first recognised line with everything except ASCII letters and
 *         digits removed; the raw response body when the response contains no
 *         usable recognition result
 * @throws Exception if the image cannot be processed, the token cannot be
 *                   obtained, or the OCR request fails
 */
public String generalBasic(String captchaUrl) throws Exception{
	// Request URL
	String url = HTTPS_AIP_BAIDUBCE_COM + "/rest/2.0/ocr/v1/general_basic";

	// Download and pre-process the captcha, then Base64-encode it for the form body
	byte[] imgData = BinaryImage.imageProcessing(captchaUrl);
	String imgStr = Base64Util.encode(imgData);
	// The access token comes from getAccessToken(), which consults a cache before requesting a new one.
	String accessToken = getAccessToken();
	if (accessToken == null) {
		throw new RuntimeException("百度token获取失败");
	}
	String requestUrl = url + "?access_token=" + accessToken;
	Connection.Response post = Jsoup.connect(requestUrl).timeout(5000).header("Content-Type", "application/x-www-form-urlencoded")
			.data("image", imgStr)
			.data("language_type", "ENG")
			.method(Connection.Method.POST)
			.ignoreContentType(true)
			.execute();
	String body = post.body();
	CaptchaResultInfo captchaResultInfo = GsonUtils.fromJson(body, CaptchaResultInfo.class);
	// An empty result list must not be indexed: get(0) would throw IndexOutOfBoundsException.
	if (captchaResultInfo != null
			&& captchaResultInfo.getWordsResult() != null
			&& !captchaResultInfo.getWordsResult().isEmpty()) {
		CaptchaResultInfo.WordsResult wordsResult = captchaResultInfo.getWordsResult().get(0);
		if (wordsResult != null && wordsResult.getWords() != null) {
			// Keep only ASCII letters and digits.
			return wordsResult.getWords().replaceAll("[^0-9a-zA-Z]", "");
		}
	}
	return body;

}

/**
 * Returns the OCR access token, counting the call against a per-day limit.
 *
 * <p>Every invocation increments the counter for the current calendar day;
 * once more than 30000 earlier calls have been counted for that day a
 * {@link RuntimeException} is thrown. The token itself is taken from
 * {@code accessTokenCache} when present, otherwise requested through {@code get}.
 *
 * @return the access token, or {@code null} when {@code get} could not obtain one
 * @throws IOException        if requesting a new token fails
 * @throws ExecutionException if the counter cache fails to load its entry
 */
private String getAccessToken() throws IOException, ExecutionException {
	String url = HTTPS_AIP_BAIDUBCE_COM + "/oauth/2.0/token?grant_type=client_credentials&client_id=xxxxx&client_secret=xxxx";
	// Key the counter by today's date. The formatted date is needed here:
	// concatenating the formatter object itself does not yield a date, so the
	// key would not change from one day to the next.
	String today = new SimpleDateFormat("yyyyMMdd").format(new java.util.Date());
	long callsSoFar = tokenCntCache.get(TOKEN_KEY + today).getAndIncrement();
	if (callsSoFar > 30000) {
		throw new RuntimeException("百度OCR免费次数达到上限，请明天再试。");
	}
	String token = accessTokenCache.getIfPresent(TOKEN_KEY);
	if (token == null || token.length() == 0) {
		token = get(TOKEN_KEY, url);
	}
	return token;
}

/**
 * Returns the access token cached under {@code key}, requesting a new one from
 * {@code url} and caching it when none is present.
 *
 * <p>The cache is checked again inside the synchronized method so that a thread
 * that waited for the monitor reuses a token another thread has just stored.
 * NOTE(review): the monitor belongs to this instance while the cache is static;
 * confirm that only one instance of the enclosing class is in use.
 *
 * @param key cache key to look up and to store the new token under
 * @param url token endpoint, including the client credentials
 * @return the access token, or {@code null} when the response carries no token
 * @throws IOException if the token request fails
 */
private synchronized String get(String key, String url) throws IOException {
	String cached = accessTokenCache.getIfPresent(key);
	if (cached != null && cached.length() != 0) {
		return cached;
	}
	Connection.Response execute = Jsoup.connect(url).timeout(5000).ignoreContentType(true).execute();
	String body = execute.body();
	AccessTokenInfo accessTokenInfo = GsonUtils.fromJson(body, AccessTokenInfo.class);
	if (accessTokenInfo != null && accessTokenInfo.getAccessToken() != null && accessTokenInfo.getAccessToken().length() != 0) {
		String accessToken = accessTokenInfo.getAccessToken();
		// Store under the same key that was looked up above, not a fixed constant.
		accessTokenCache.put(key, accessToken);
		return accessToken;
	}
	return null;
}

总结

整体来说，虽然识别率达不到百分之百，但是多换几张验证码重试几次，基本都可以识别出来，一般的需求还是够用的。