Skip to content

Instantly share code, notes, and snippets.

@HejiaHo
Created September 14, 2014 17:26
Show Gist options
  • Save HejiaHo/9dc6d44f8cd1331db5d1 to your computer and use it in GitHub Desktop.
Save HejiaHo/9dc6d44f8cd1331db5d1 to your computer and use it in GitHub Desktop.
保存Java2s中的所有cpp程序
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpRequest;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import javax.net.ssl.SSLException;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Saves every C++ example program published in the java2s.com C++ tutorial.
 *
 * <p>The crawler reads the tutorial catalog, follows each chapter page, downloads every program
 * page it finds there, extracts the text inside the page's {@code <code>} element and writes it
 * to a {@code .cpp} file under {@link #SAVE_PATH}.
 *
 * User: jse7en
 * Date: 2014/9/14
 * Time: 12:12
 * Version: 1.0
 */
public class Java2sCpp {
    /**
     * Default directory the programs are saved to.
     */
    public static final String SAVE_PATH = "D:/selfsrc/cpp/java2s/";

    /**
     * Browser User-Agent header sent with every request.
     */
    public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " +
            "Chrome/37.0.2062.3 Safari/537.36";

    /**
     * Root address of java2s; relative links found in pages are appended to it.
     */
    public static final String JAVA2S = "http://www.java2s.com/";

    /** Upper bound on attempts used by every retry loop in this class. */
    private static final int MAX_ATTEMPTS = 10;

    /** Socket and connect timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Captures everything between the first {@code <code>} and the last {@code </code>} (greedy). */
    private static final Pattern CODE_PATTERN = Pattern.compile("<code>([\\w|\\W]+)</code>");

    /** Matches {@code <br>}, {@code <br/>} and {@code <br />} in any letter case. */
    private static final Pattern LINE_BREAK_PATTERN = Pattern.compile("<br\\s*/?>", Pattern.CASE_INSENSITIVE);

    /** Matches the opening and closing HTML tags that are stripped from the extracted code. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<\\S*?( [^>|^/]*)?>|(</\\S*?>)");

    /** Captures the href of each program link on a chapter page. */
    private static final Pattern PROGRAM_LINK_PATTERN = Pattern.compile("<td align=left><A href=\"(\\S+)\">");

    /** Captures the href of each chapter link on the catalog page. */
    private static final Pattern CHAPTER_LINK_PATTERN = Pattern.compile("<TD width=390>\\S+<A href=\"(\\S+)\">");

    /**
     * Fetches the body of the page at the given URL.
     *
     * <p>The HTTP client is created per call and is always closed again, together with the
     * response, so that no connections are left open.
     *
     * @param url address of the page to fetch
     * @return the page body, or an empty string when the status code is outside 200-299 or the
     *         response carries no body
     * @throws IOException if the request fails
     */
    public static String getContent(String url) throws IOException {
        CloseableHttpClient httpclient = HttpClients.custom()
                .setRetryHandler(new DefaultRetryHandler())
                .build();
        try {
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader(HttpHeaders.USER_AGENT, USER_AGENT);
            // Bound both the connect phase and each socket read.
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(TIMEOUT_MILLIS)
                    .setConnectTimeout(TIMEOUT_MILLIS)
                    .build();
            httpGet.setConfig(requestConfig);
            CloseableHttpResponse response = httpclient.execute(httpGet);
            try {
                System.out.println(url + " ==> " + response.getStatusLine());
                int statusCode = response.getStatusLine().getStatusCode();
                // Only 2xx responses carry a usable page.
                if (statusCode < 200 || statusCode >= 300) {
                    return "";
                }
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    // A successful response may have no body at all.
                    return "";
                }
                String body = EntityUtils.toString(entity);
                return body == null ? "" : body;
            } finally {
                response.close();
            }
        } finally {
            httpclient.close();
        }
    }

    /**
     * Extracts the program text from a page and converts it from HTML to plain text.
     *
     * <p>Line-break tags become newlines, every other tag is removed, and the character
     * entities used by the pages are decoded. The match is greedy: when a page contains several
     * {@code <code>} elements, everything from the first opening tag to the last closing tag is
     * taken.
     *
     * @param html page content
     * @return the decoded program text, or an empty string when the page has no {@code <code>}
     *         element
     */
    public static String getCode(String html) {
        System.out.println("获取代码");
        Matcher codeMatcher = CODE_PATTERN.matcher(html);
        String codeHtml = codeMatcher.find() ? codeMatcher.group(1) : "";
        // Line breaks must be converted before tag stripping, which would otherwise delete them.
        String withLineBreaks = LINE_BREAK_PATTERN.matcher(codeHtml).replaceAll("\n");
        String text = TAG_PATTERN.matcher(withLineBreaks).replaceAll("");
        // String.replace is literal: a backslash replacement would be rejected by replaceAll.
        // "&amp;" is decoded last so that text such as "&amp;lt;" is decoded exactly once.
        return text.replace("&nbsp;", " ")
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&#34;", "\"")
                .replace("&#92;", "\\")
                .replace("&#39;", "'")
                .replace("&amp;", "&");
    }

    /**
     * Writes a program to {@link #SAVE_PATH}, naming the file after the page it came from.
     *
     * <p>I/O failures are printed and otherwise ignored, so that one bad file does not stop the
     * crawl.
     *
     * @param url  address of the page the program was taken from
     * @param code program text
     */
    public static void saveCppProgram(String url, String code) {
        String fileName = toFileName(url);
        System.out.println("保存程序 " + fileName + " 到 " + SAVE_PATH);
        try {
            FileOutputStream out = new FileOutputStream(SAVE_PATH + fileName);
            try {
                // NOTE(review): encodes with the platform default charset - confirm this is intended.
                out.write(code.getBytes());
                out.flush();
            } finally {
                // Release the file handle even when the write fails.
                out.close();
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException and is handled identically.
            e.printStackTrace();
        }
    }

    /**
     * Derives the target file name: the last path segment of the URL with its extension
     * (for example {@code .htm} or {@code .html}) replaced by {@code .cpp}.
     */
    private static String toFileName(String url) {
        String pageName = url.substring(url.lastIndexOf('/') + 1);
        int dot = pageName.lastIndexOf('.');
        String baseName = dot >= 0 ? pageName.substring(0, dot) : pageName;
        return baseName + ".cpp";
    }

    /**
     * Collects the URLs of all programs listed on a chapter page.
     *
     * <p>A failed download is retried up to {@code MAX_ATTEMPTS} times in total; after that the
     * chapter is given up and an empty list is returned.
     *
     * @param url address of the chapter page that links to the chapter's programs
     * @return absolute URLs of the chapter's programs, empty if the page could not be fetched
     */
    public static List<String> getUrlList(String url) {
        System.out.println("获取" + url + "下的所有url");
        List<String> list = new ArrayList<String>();
        String content = null;
        // A bounded loop instead of recursion: an unreachable site must not overflow the stack.
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                content = getContent(url);
                break;
            } catch (IOException e) {
                e.printStackTrace();
                if (attempt < MAX_ATTEMPTS) {
                    System.out.println("再次重试-" + url);
                } else {
                    System.out.println("放弃重试" + url);
                }
            }
        }
        if (content == null) {
            return list;
        }
        Matcher matcher = PROGRAM_LINK_PATTERN.matcher(content);
        while (matcher.find()) {
            list.add(JAVA2S + matcher.group(1));
        }
        System.out.println(list);
        return list;
    }

    /**
     * Collects the URLs of all chapters linked from the catalog page.
     *
     * @param content content of the catalog page
     * @return set of absolute chapter URLs
     */
    public static Set<String> getAllUrlSet(String content) {
        System.out.println("获取所有URL");
        Set<String> set = new HashSet<String>();
        Matcher matcher = CHAPTER_LINK_PATTERN.matcher(content);
        while (matcher.find()) {
            set.add(JAVA2S + matcher.group(1));
        }
        return set;
    }

    /**
     * Crawls the whole C++ tutorial: for each chapter, starts one {@link SaveThread} per program
     * and waits for all of them to finish before moving on to the next chapter.
     *
     * @param args unused
     * @throws IOException          if the catalog page cannot be fetched
     * @throws InterruptedException if the main thread is interrupted while waiting for workers
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        Set<String> chapterUrls = getAllUrlSet(getContent("http://www.java2s.com/Tutorial/Cpp/CatalogCpp.htm"));
        int chapterNumber = 0;
        System.out.println("------------total--------------" + chapterUrls.size());
        for (String chapterUrl : chapterUrls) {
            System.out.println("---------------" + ++chapterNumber);
            List<String> programUrls = getUrlList(chapterUrl);
            SaveThread[] workers = new SaveThread[programUrls.size()];
            for (int i = 0; i < workers.length; i++) {
                workers[i] = new SaveThread(programUrls.get(i));
            }
            for (SaveThread worker : workers) {
                worker.start();
            }
            // join() returns only once the worker has terminated, so no further polling is needed.
            for (SaveThread worker : workers) {
                worker.join();
            }
        }
    }

    /**
     * Worker thread that downloads one program page and saves its code.
     */
    static class SaveThread extends Thread {
        private String url;
        private int tryTimes;

        /**
         * @param url address of the program page to download
         */
        public SaveThread(String url) {
            this.url = url;
            tryTimes = 0;
        }

        /**
         * @param url address of the program page to download
         */
        public void setUrl(String url) {
            this.url = url;
        }

        /**
         * Downloads, extracts and saves the program, retrying after any exception until
         * {@code MAX_ATTEMPTS} attempts have been made.
         */
        @Override
        public void run() {
            while (true) {
                tryTimes++;
                try {
                    saveCppProgram(url, getCode(getContent(url)));
                    return;
                } catch (Exception e) {
                    e.printStackTrace();
                    if (tryTimes >= MAX_ATTEMPTS) {
                        System.out.println("放弃重试" + url);
                        return;
                    }
                }
            }
        }
    }

    /**
     * Retry policy for failed HTTP requests.
     */
    static class DefaultRetryHandler implements HttpRequestRetryHandler {
        /**
         * Decides whether a failed request is executed again.
         *
         * @param exception      the failure that occurred
         * @param executionCount number of times the request has been executed so far
         * @param context        execution context of the request
         * @return {@code true} to retry, {@code false} to give up
         */
        @Override
        public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
            System.out.println("第" + executionCount + "次重试中");
            if (executionCount >= MAX_ATTEMPTS) {
                // Give up once the request has been executed ten times.
                System.out.println("放弃重试");
                return false;
            }
            // Timeouts (read or connect), unknown hosts and SSL handshake failures are not retried.
            if (exception instanceof InterruptedIOException
                    || exception instanceof UnknownHostException
                    || exception instanceof ConnectTimeoutException
                    || exception instanceof SSLException) {
                return false;
            }
            // Requests without a body are treated as idempotent and may be sent again.
            HttpRequest request = HttpClientContext.adapt(context).getRequest();
            return !(request instanceof HttpEntityEnclosingRequest);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment