验证码自动识别

概要

最近朋友突然让我做一个豆瓣自动回帖的功能。频繁顶帖超过三次就会出现验证码,这就涉及到了文本OCR,正好百度有个免费的OCR识别接口。说做就做,获取token,调用接口,结果发现识别正确率并不高。查了资料才知道,百度OCR只负责识别文字,并不会先对图片进行二值化和降噪。

二值化

验证码的识别有很多Python版本,但是Java版本很少,处理得比较好的就更少了。所以我们就仿照Python的逻辑写了个Java版本的二值化。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/**
 * Downloads a captcha image, converts it to pure black/white and removes isolated noise pixels.
 *
 * <p>The method is static because it keeps no instance state and is invoked through the class
 * name by its caller.
 *
 * @param url address of the captcha image
 * @return the processed image encoded as JPEG bytes
 * @throws IOException if the download fails, the response cannot be decoded as an image,
 *                     or no JPEG writer accepts the processed image
 */
public static byte[] imageProcessing(String url) throws IOException {
    Connection.Response response = Jsoup.connect(url).timeout(5000)
            .header("Host", "www.douban.com")
            .header("Referer", "https://www.douban.com/")
            .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0/")
            .ignoreContentType(true)
            .execute();

    BufferedImage source;
    // Close the response stream as soon as the image has been decoded.
    try (BufferedInputStream in = response.bodyStream()) {
        source = ImageIO.read(in);
    }
    // ImageIO.read returns null (instead of throwing) when no reader understands the data.
    if (source == null) {
        throw new IOException("Response from " + url + " could not be decoded as an image");
    }

    // Binarize: only pixels whose three channels are all <= 30 stay black.
    BufferedImage binary = binarize(source, 30);
    // Denoise
    BufferedImage cleaned = denoise(binary);

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // ImageIO.write signals "no suitable writer" through its return value, not an exception.
    if (!ImageIO.write(cleaned, "jpg", bos)) {
        throw new IOException("No JPEG writer available for the processed captcha image");
    }
    return bos.toByteArray();
}

/** Returns a TYPE_BYTE_BINARY copy where a pixel is white if any RGB channel exceeds threshold. */
private static BufferedImage binarize(BufferedImage source, int threshold) {
    int w = source.getWidth();
    int h = source.getHeight();
    // Hoisted so no Color object is allocated per pixel.
    int white = Color.WHITE.getRGB();
    int black = Color.BLACK.getRGB();
    BufferedImage binary = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
    for (int x = 0; x < w; x++) {
        for (int y = 0; y < h; y++) {
            int argb = source.getRGB(x, y);
            int r = (argb & 0xff0000) >> 16;
            int g = (argb & 0xff00) >> 8;
            int b = (argb & 0xff);
            boolean bright = r > threshold || g > threshold || b > threshold;
            binary.setRGB(x, y, bright ? white : black);
        }
    }
    return binary;
}

降噪

降噪的作用其实不是很大,但为了提高识别率,我们还是加上了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
 /**
  * Removes single-pixel noise: every black pixel whose existing neighbours (up to eight,
  * fewer on edges and corners) are all white is repainted white.
  *
  * <p>The image is modified in place. Images narrower or shorter than two pixels are handled
  * as well, because neighbours outside the image are simply skipped. In practice the pass can
  * be repeated, or the unit enlarged to several pixels, for stronger denoising.
  *
  * @param image the (binarized) image to clean; modified in place
  * @return the same {@code image} instance, for call chaining
  */
public static BufferedImage denoise(BufferedImage image) {
    int w = image.getWidth();
    int h = image.getHeight();
    int white = new Color(255, 255, 255).getRGB();

    // Scan order does not influence the result: a pixel is cleared only when all of its
    // neighbours are already white, so clearing it cannot change the verdict for any
    // other black pixel.
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            if (isBlack(image.getRGB(x, y)) && hasOnlyWhiteNeighbours(image, x, y, w, h)) {
                image.setRGB(x, y, white);
            }
        }
    }
    return image;
}

/** Checks that every in-bounds pixel of the 3x3 neighbourhood around (x, y) is white. */
private static boolean hasOnlyWhiteNeighbours(BufferedImage image, int x, int y, int w, int h) {
    for (int dy = -1; dy <= 1; dy++) {
        for (int dx = -1; dx <= 1; dx++) {
            if (dx == 0 && dy == 0) {
                continue; // the pixel itself
            }
            int nx = x + dx;
            int ny = y + dy;
            if (nx < 0 || ny < 0 || nx >= w || ny >= h) {
                continue; // outside the image: edges and corners have fewer neighbours
            }
            if (!isWhite(image.getRGB(nx, ny))) {
                return false;
            }
        }
    }
    return true;
}

/**
 * Classifies a packed RGB value as "black": the red, green and blue components
 * add up to at most 300. The alpha bits are ignored.
 *
 * @param colorInt packed colour with red in bits 16-23, green in bits 8-15, blue in bits 0-7
 * @return {@code true} when the component sum is 300 or less
 */
public static boolean isBlack(int colorInt) {
    int red = (colorInt >> 16) & 0xFF;
    int green = (colorInt >> 8) & 0xFF;
    int blue = colorInt & 0xFF;
    return red + green + blue <= 300;
}

/**
 * Classifies a packed RGB value as "white": the red, green and blue components
 * add up to more than 300. The alpha bits are ignored.
 *
 * @param colorInt packed colour with red in bits 16-23, green in bits 8-15, blue in bits 0-7
 * @return {@code true} when the component sum exceeds 300
 */
public static boolean isWhite(int colorInt) {
    int red = (colorInt >> 16) & 0xFF;
    int green = (colorInt >> 8) & 0xFF;
    int blue = colorInt & 0xFF;
    return red + green + blue > 300;
}

百度OCR

百度普通文本OCR识别,个人用户每天有50,000次免费调用,所以对于并发量不是很大的需求,这么多的免费次数还是够用的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
// Cache key of the access token; also used as the prefix of the request-counter keys.
private static final String TOKEN_KEY = "TOKEN_KEY";
private static final String HTTPS_AIP_BAIDUBCE_COM = "https://aip.baidubce.com";
// Expiry is counted from the moment the token is stored (expireAfterWrite). With
// expireAfterAccess every read would push the deadline back, so a token in constant use
// would never be refreshed. 2591000 s is just under 30 days - presumably chosen to stay
// below the token lifetime reported by Baidu; TODO confirm against the "expires_in" value.
private static final Cache<String, String> accessTokenCache = CacheBuilder.newBuilder().expireAfterWrite(2591000, TimeUnit.SECONDS).build();
// Request counters; each counter is dropped 24 hours after it was created, regardless of
// how often it is read in between.
private static final LoadingCache<String, AtomicLong> tokenCntCache = CacheBuilder.newBuilder().expireAfterWrite(24, TimeUnit.HOURS).build(new CacheLoader<String, AtomicLong>() {
    @Override
    public AtomicLong load(String s) throws Exception {
        // A key that has not been seen yet starts counting from zero.
        return new AtomicLong(0);
    }
});


/**
 * Recognizes the text of a captcha image through Baidu's {@code general_basic} OCR endpoint.
 *
 * <p>Important: the helper classes used by this code (FileUtil, Base64Util, HttpUtil,
 * GsonUtils) can be downloaded from
 * https://ai.baidu.com/file/658A35ABAB2D404FBF903F64D47C1F72
 * https://ai.baidu.com/file/C8D81F3301E24D2892968F09AE1AD6E2
 * https://ai.baidu.com/file/544D677F5D4E4F17B4122FBD60DB82B3
 * https://ai.baidu.com/file/470B3ACCA3FE43788B5A963BF0B625F3
 *
 * @param captchaUrl address of the captcha image to download and recognize
 * @return the first recognized line with every character other than ASCII letters and digits
 *         removed; if the response contains no recognized words, the raw response body
 * @throws RuntimeException if no access token could be obtained
 * @throws Exception if downloading/processing the image or calling the OCR endpoint fails
 */
public String generalBasic(String captchaUrl) throws Exception {
    // Request URL
    String url = HTTPS_AIP_BAIDUBCE_COM + "/rest/2.0/ocr/v1/general_basic";

    // Download the captcha and pre-process it (binarize + denoise) before sending it to the OCR.
    byte[] imgData = BinaryImage.imageProcessing(captchaUrl);
    String imgStr = Base64Util.encode(imgData);
    String accessToken = getAccessToken();
    if (accessToken == null) {
        throw new RuntimeException("百度token获取失败");
    }
    String requestUrl = url + "?access_token=" + accessToken;
    Connection.Response post = Jsoup.connect(requestUrl).timeout(5000).header("Content-Type", "application/x-www-form-urlencoded")
            .data("image", imgStr)
            .data("language_type", "ENG")
            .method(Connection.Method.POST)
            .ignoreContentType(true)
            .execute();
    String body = post.body();
    CaptchaResultInfo captchaResultInfo = GsonUtils.fromJson(body, CaptchaResultInfo.class);
    // An empty result list must be checked explicitly: get(0) on it would throw
    // IndexOutOfBoundsException instead of falling through to the raw-body fallback.
    if (captchaResultInfo == null
            || captchaResultInfo.getWordsResult() == null
            || captchaResultInfo.getWordsResult().isEmpty()) {
        return body;
    }
    CaptchaResultInfo.WordsResult wordsResult = captchaResultInfo.getWordsResult().get(0);
    if (wordsResult == null || wordsResult.getWords() == null) {
        return body;
    }
    // Keep only ASCII letters and digits of the recognized text.
    return wordsResult.getWords().replaceAll("[^0-9a-zA-Z]", "");
}

/**
 * Returns the Baidu access token, enforcing a self-imposed limit of 30000 calls per day.
 *
 * <p>The token is taken from the cache when present; otherwise it is requested via
 * {@code get}. The client_id / client_secret in the URL are placeholders ("xxxxx") that
 * have to be replaced with real credentials.
 *
 * @return the access token, or {@code null} if it could not be obtained
 * @throws RuntimeException if the daily call limit has been reached
 * @throws IOException if requesting a new token fails
 * @throws ExecutionException if the call counter cannot be loaded
 */
private String getAccessToken() throws IOException, ExecutionException {
    String url = HTTPS_AIP_BAIDUBCE_COM + "/oauth/2.0/token?grant_type=client_credentials&client_id=xxxxx&client_secret=xxxx";
    // The counter key must contain today's date. Concatenating the formatter object itself
    // would append its toString(), which is the same every day, so the counter would never
    // roll over to a new day.
    String today = new SimpleDateFormat("yyyyMMdd").format(new java.util.Date());
    long callsSoFar = tokenCntCache.get(TOKEN_KEY + today).getAndIncrement();
    // getAndIncrement returns the value before this call, so ">=" admits exactly 30000 calls.
    if (callsSoFar >= 30000) {
        throw new RuntimeException("百度OCR免费次数达到上限,请明天再试。");
    }
    String token = accessTokenCache.getIfPresent(TOKEN_KEY);
    if (token == null || token.length() == 0) {
        token = get(TOKEN_KEY, url);
    }
    return token;
}

/**
 * Fetches an access token from {@code url} and stores it in the token cache under {@code key},
 * unless a non-empty token is already cached under that key.
 *
 * @param key cache key to look up and to store the token under
 * @param url token endpoint, including the credentials as query parameters
 * @return the cached or newly fetched token, or {@code null} if the response carried no token
 * @throws IOException if the HTTP request fails
 */
private String get(String key, String url) throws IOException {
    // The cache is static, so the lock has to be shared by all instances as well; an
    // instance-level lock would let two instances request a token at the same time.
    synchronized (accessTokenCache) {
        // Re-check under the lock: another thread may have stored the token meanwhile.
        String cached = accessTokenCache.getIfPresent(key);
        if (cached != null && cached.length() != 0) {
            return cached;
        }
        Connection.Response execute = Jsoup.connect(url).timeout(5000).ignoreContentType(true).execute();
        String body = execute.body();
        AccessTokenInfo accessTokenInfo = GsonUtils.fromJson(body, AccessTokenInfo.class);
        if (accessTokenInfo != null && accessTokenInfo.getAccessToken() != null && accessTokenInfo.getAccessToken().length() != 0) {
            String accessToken = accessTokenInfo.getAccessToken();
            // Store under the key that was looked up, so lookup and storage always agree.
            accessTokenCache.put(key, accessToken);
            return accessToken;
        }
        return null;
    }
}

总结

整体来说,虽然识别率达不到百分百,但是多尝试几次不同的验证码识别,基本可以识别出来,一般的需求还是够用。