Created
September 14, 2014 17:26
-
-
Save HejiaHo/9dc6d44f8cd1331db5d1 to your computer and use it in GitHub Desktop.
保存Java2s中的所有cpp程序
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.http.HttpEntity; | |
import org.apache.http.HttpEntityEnclosingRequest; | |
import org.apache.http.HttpHeaders; | |
import org.apache.http.HttpRequest; | |
import org.apache.http.client.HttpRequestRetryHandler; | |
import org.apache.http.client.config.RequestConfig; | |
import org.apache.http.client.methods.CloseableHttpResponse; | |
import org.apache.http.client.methods.HttpGet; | |
import org.apache.http.client.protocol.HttpClientContext; | |
import org.apache.http.conn.ConnectTimeoutException; | |
import org.apache.http.impl.client.CloseableHttpClient; | |
import org.apache.http.impl.client.HttpClients; | |
import org.apache.http.protocol.HttpContext; | |
import org.apache.http.util.EntityUtils; | |
import javax.net.ssl.SSLException; | |
import java.io.FileNotFoundException; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.InterruptedIOException; | |
import java.net.UnknownHostException; | |
import java.util.ArrayList; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.Set; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/** | |
* 保存Java2s中的所有cpp程序 | |
* User: jse7en | |
* Date: 2014/9/14 | |
* Time: 12:12 | |
* Version: 1.0 | |
*/ | |
public class Java2sCpp { | |
/** | |
* 默认保存路径 | |
*/ | |
public static final String SAVE_PATH = "D:/selfsrc/cpp/java2s/"; | |
/** | |
* 浏览器表头 | |
*/ | |
public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " + | |
"Chrome/37.0.2062.3 Safari/537.36"; | |
/** | |
* java2s 根地址 | |
*/ | |
public static final String JAVA2S = "http://www.java2s.com/"; | |
/** | |
* 根据url获取页面内容,如出现错误,则返回空 | |
* | |
* @param url 要获取的url地址 | |
* @return 页面内容 | |
* @throws IOException | |
*/ | |
public static String getContent(String url) throws IOException { | |
CloseableHttpClient httpclient = HttpClients.custom().setRetryHandler(new DefaultRetryHandler()).build(); | |
HttpGet httpGet = new HttpGet(url); | |
httpGet.addHeader(HttpHeaders.USER_AGENT, USER_AGENT); | |
//设置超时时间 | |
RequestConfig requestConfig = RequestConfig.custom() | |
.setSocketTimeout(5000) | |
.setConnectTimeout(5000) | |
.build(); | |
httpGet.setConfig(requestConfig); | |
CloseableHttpResponse response = httpclient.execute(httpGet); | |
try { | |
System.out.println(url + " ==> " + response.getStatusLine()); | |
int statusLine = response.getStatusLine().getStatusCode(); | |
//判断页面返回是否有效 | |
if (statusLine >= 200 && statusLine <= 300) { | |
HttpEntity entity = response.getEntity(); | |
return EntityUtils.toString(entity); | |
} else { | |
return ""; | |
} | |
} finally { | |
response.close(); | |
} | |
} | |
/** | |
* 根据页面内容,获取代码,并进行转译 | |
* | |
* @param html 页面内容 | |
* @return 代码 | |
*/ | |
public static String getCode(String html) { | |
System.out.println("获取代码"); | |
Pattern pattern = Pattern.compile("<code>([\\w|\\W]+)</code>"); | |
Matcher matcher = pattern.matcher(html); | |
String codeHtml = ""; | |
if (matcher.find()) { | |
codeHtml = matcher.group(1); | |
} | |
Pattern tagPattern = Pattern.compile("<\\S*?( [^>|^/]*)?>|(</\\S*?>)"); | |
Matcher tagMatcher = tagPattern.matcher(codeHtml); | |
String result = tagMatcher.replaceAll(""); | |
result = result.replaceAll(" ", " ").replaceAll("<br>", "\n").replaceAll("<", "<") | |
.replaceAll(">", ">").replaceAll("&", "&").replaceAll(""", "\"").replaceAll("\", "\\") | |
.replaceAll("'", "'"); | |
return result; | |
} | |
/** | |
* 保存代码 | |
* | |
* @param url url地址 | |
* @param code 代码内容 | |
*/ | |
public static void saveCppProgram(String url, String code) { | |
int index = url.lastIndexOf("/") + 1; | |
int endIndex = url.length() - ".html".length() + 1; | |
String fileName = url.substring(index, endIndex) + ".cpp"; | |
System.out.println("保存程序 " + fileName + " 到 " + SAVE_PATH); | |
try { | |
FileOutputStream out = new FileOutputStream(SAVE_PATH + fileName); | |
out.write(code.getBytes()); | |
out.flush(); | |
out.close(); | |
} catch (FileNotFoundException e) { | |
e.printStackTrace(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
/** | |
* 获取某章节下所有程序的url列表 | |
* | |
* @param url 章节地址,包含该章节所有程序的url | |
* @return 该章节下所有程序的url列表 | |
*/ | |
public static List<String> getUrlList(String url) { | |
System.out.println("获取" + url + "下的所有url"); | |
List<String> list = new ArrayList<String>(); | |
Pattern pattern = Pattern.compile("<td align=left><A href=\"(\\S+)\">"); | |
String content = ""; | |
try { | |
content = getContent(url); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
System.out.println("再次重试-" + url); | |
//超时重试 | |
return getUrlList(url); | |
} | |
Matcher matcher = pattern.matcher(content); | |
while (matcher.find()) { | |
list.add(JAVA2S + matcher.group(1)); | |
} | |
System.out.println(list); | |
return list; | |
} | |
/** | |
* 获取所有章节的url | |
* | |
* @param content 章节页面的内容 | |
* @return 所有章节url的set | |
*/ | |
public static Set<String> getAllUrlSet(String content) { | |
System.out.println("获取所有URL"); | |
Set<String> set = new HashSet<String>(); | |
Pattern pattern = Pattern.compile("<TD width=390>\\S+<A href=\"(\\S+)\">"); | |
Matcher matcher = pattern.matcher(content); | |
while (matcher.find()) { | |
set.add(JAVA2S + matcher.group(1)); | |
} | |
return set; | |
} | |
public static void main(String[] args) throws IOException, InterruptedException { | |
Set<String> set = getAllUrlSet(getContent("http://www.java2s.com/Tutorial/Cpp/CatalogCpp.htm")); | |
int k = 0; | |
System.out.println("------------total--------------" + set.size()); | |
for (String url : set) { | |
System.out.println("---------------" + ++k); | |
List<String> list = getUrlList(url); | |
SaveThread[] threads = new SaveThread[list.size()]; | |
for (int i = 0; i < list.size(); i++) { | |
threads[i] = new SaveThread(list.get(i)); | |
} | |
for (int j = 0; j < threads.length; j++) { | |
threads[j].start(); | |
} | |
for (int j = 0; j < threads.length; j++) { | |
threads[j].join(); | |
} | |
boolean allDone = false; | |
while (allDone) { | |
allDone = true; | |
for (int j = 0; j < threads.length; j++) { | |
if (threads[j].isAlive()) { | |
allDone = false; | |
} | |
} | |
Thread.sleep(5000); | |
} | |
} | |
} | |
/** | |
* 用于执行保存的线程 | |
*/ | |
static class SaveThread extends Thread { | |
private String url; | |
private int tryTimes; | |
public SaveThread(String url) { | |
this.url = url; | |
tryTimes = 0; | |
} | |
public void setUrl(String url) { | |
this.url = url; | |
} | |
public void run() { | |
try { | |
tryTimes++; | |
saveCppProgram(url, getCode(getContent(url))); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
if (tryTimes < 10) { | |
run(); | |
} else { | |
System.out.println("放弃重试" + url); | |
} | |
} | |
} | |
} | |
/** | |
* 连接重试的处理器 | |
*/ | |
static class DefaultRetryHandler implements HttpRequestRetryHandler { | |
@Override | |
public boolean retryRequest(IOException exception, int executionCount, HttpContext context) { | |
System.out.println("第" + executionCount + "次重试中"); | |
if (executionCount >= 10) { | |
// 如果已经重试了10次,就放弃 | |
System.out.println("放弃重试"); | |
return false; | |
} | |
if (exception instanceof InterruptedIOException) { | |
// 超时 | |
return false; | |
} | |
if (exception instanceof UnknownHostException) { | |
// 目标服务器不可达 | |
return false; | |
} | |
if (exception instanceof ConnectTimeoutException) { | |
// 连接被拒绝 | |
return false; | |
} | |
if (exception instanceof SSLException) { | |
// ssl握手异常 | |
return false; | |
} | |
HttpClientContext clientContext = HttpClientContext.adapt(context); | |
HttpRequest request = clientContext.getRequest(); | |
boolean idempotent = !(request instanceof HttpEntityEnclosingRequest); | |
if (idempotent) { | |
// 如果请求是幂等的,就再次尝试 | |
return true; | |
} | |
return false; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment