Skip to content

Instantly share code, notes, and snippets.

@tedzhou
Created April 16, 2014 05:36
Show Gist options
  • Save tedzhou/10811840 to your computer and use it in GitHub Desktop.
Save tedzhou/10811840 to your computer and use it in GitHub Desktop.
抓数据的工具集
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<option name="DEFAULT_COMPILER" value="Javac" />
<resourceExtensions />
<wildcardResourcePatterns>
<entry name="!?*.java" />
<entry name="!?*.form" />
<entry name="!?*.class" />
<entry name="!?*.groovy" />
<entry name="!?*.scala" />
<entry name="!?*.flex" />
<entry name="!?*.kt" />
<entry name="!?*.clj" />
</wildcardResourcePatterns>
<annotationProcessing>
<profile default="true" name="Default" enabled="false">
<processorPath useClasspath="true" />
</profile>
</annotationProcessing>
</component>
</project>
<component name="ProjectDictionaryState">
<dictionary name="ted" />
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptLibraryMappings">
<file url="file://$PROJECT_DIR$/ioHelper.js" libraries="{Node.js Globals}" />
<file url="file://$PROJECT_DIR$/ioPool.js" libraries="{Node.js Globals}" />
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectInspectionProfilesVisibleTreeState">
<entry key="Project Default">
<profile-state>
<expanded-state>
<State>
<id />
</State>
<State>
<id>CSS</id>
</State>
<State>
<id>General</id>
</State>
<State>
<id>GeneralJavaScript</id>
</State>
<State>
<id>JavaScript</id>
</State>
<State>
<id>Probable bugs</id>
</State>
</expanded-state>
<selected-state>
<State>
<id>Class structure</id>
</State>
</selected-state>
</profile-state>
</entry>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="SvnConfiguration" maxAnnotateRevisions="500" myUseAcceleration="nothing" myAutoUpdateAfterCommit="false" cleanupOnStartRun="false" SSL_PROTOCOLS="sslv3">
<option name="USER" value="" />
<option name="PASSWORD" value="" />
<option name="mySSHConnectionTimeout" value="30000" />
<option name="mySSHReadTimeout" value="30000" />
<option name="LAST_MERGED_REVISION" />
<option name="MERGE_DRY_RUN" value="false" />
<option name="MERGE_DIFF_USE_ANCESTRY" value="true" />
<option name="UPDATE_LOCK_ON_DEMAND" value="false" />
<option name="IGNORE_SPACES_IN_MERGE" value="false" />
<option name="CHECK_NESTED_FOR_QUICK_MERGE" value="false" />
<option name="IGNORE_SPACES_IN_ANNOTATE" value="true" />
<option name="SHOW_MERGE_SOURCES_IN_ANNOTATE" value="true" />
<option name="FORCE_UPDATE" value="false" />
<option name="IGNORE_EXTERNALS" value="false" />
<configuration useDefault="true">$USER_HOME$/.subversion</configuration>
<myIsUseDefaultProxy>false</myIsUseDefaultProxy>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/ioPool.iml" filepath="$PROJECT_DIR$/ioPool.iml" />
</modules>
</component>
</project>
<component name="DependencyValidationManager">
<state>
<option name="SKIP_IMPORT_STATEMENTS" value="false" />
</state>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="" />
</component>
</project>
var http = require('http');
var fs = require('fs');
var url = require('url');
var request = require('request');
var iconv = require('iconv-lite');
var mkdirp = require('mkdirp');
var path = require('path');
/**
 * Fetch a URL over HTTP and hand the decoded response body to `callback`.
 *
 * The body is decoded as GBK when the Content-Type header mentions GBK or
 * GB2312 (GBK is a superset of GB2312); otherwise it is decoded as UTF-8.
 *
 * `callback` is invoked at most once. Does nothing when either argument is
 * missing.
 *
 * @param {string|Object} urlStr - URL string, or an options object accepted by http.get
 * @param {function(?Error, ?string)} callback - receives (err, html); html is null on failure
 */
function getHtml(urlStr, callback) {
    if (!urlStr || !callback) return;
    if (typeof urlStr === "string") {
        urlStr = url.parse(urlStr);
    }

    // Guarantees a single notification even if several failure paths fire.
    var finished = false;
    function finish(err, html) {
        if (finished) return;
        finished = true;
        callback(err, html);
    }

    http.get(urlStr, function (res) {
        // Collect raw chunks; decoding happens once, after the whole body arrived,
        // so multi-byte characters split across chunks are handled correctly.
        var chunks = [];
        res.on('data', function (chunk) {
            chunks.push(chunk);
        }).on('end', function () {
            var html;
            try {
                var contentType = res.headers['content-type'];
                var charset = /gbk|gb2312/i.test(contentType) ? 'GBK' : 'utf8';
                html = iconv.decode(Buffer.concat(chunks), charset);
            } catch (ex) {
                return finish(ex, null);
            }
            // Deliberately outside the try block: an exception thrown by the
            // caller's callback must not trigger a second callback invocation.
            finish(null, html);
        }).on('error', function (err) {
            finish(err, null);
        });
    }).on('error', function (err) {
        console.log('http get error:', err);
        finish(err, null);
    });
}
/**
 * Convenience writer: writes `str` to `pathName`, creating the parent
 * directory first when it does not exist yet.
 *
 * Non-Buffer objects are serialised with JSON.stringify; strings and Buffers
 * are written as they are.
 *
 * @param {string} pathName - destination file path
 * @param {string|Buffer|Object} str - content to write
 * @param {function(?Error)} [callback] - invoked once the write finished, with the error on failure
 */
function simpleWrite(pathName, str, callback) {
    // Buffers are objects too, but must be written raw rather than as JSON.
    if (typeof str === "object" && !Buffer.isBuffer(str)) {
        str = JSON.stringify(str);
    }
    var name = path.basename(pathName);
    var docPath = path.dirname(pathName);
    if (!fs.existsSync(docPath)) {
        // parseInt instead of a 0755 literal: legacy octal literals are a
        // SyntaxError in strict mode and in ES modules.
        mkdirp.sync(docPath, parseInt('755', 8));
    }
    fs.writeFile(docPath + '/' + name, str, function (err) {
        if (err) {
            console.log(err);
        } else {
            console.log("write success" + pathName);
        }
        // Report failures as well, so callers waiting on the callback never hang.
        if (callback) callback(err);
    });
}
/**
 * Download a resource (typically an image) to a local file.
 *
 * A HEAD request is issued first and its content-type / content-length are
 * logged; the body is then streamed into `filename`.
 *
 * @param {string} uri - address of the resource
 * @param {string} filename - local path to write to
 * @param {function(Error=)} [callback] - invoked exactly once: without arguments after the
 *        file was closed, or with the error when the HEAD request or either stream failed
 */
function download(uri, filename, callback) {
    var finished = false;
    function finish(err) {
        if (finished) return;
        finished = true;
        if (!callback) return;
        if (err) {
            callback(err);
        } else {
            callback();
        }
    }

    request.head(uri, function (err, res, body) {
        if (err) {
            // Previously swallowed silently, leaving the caller waiting forever.
            console.log('download head error:', err);
            return finish(err);
        }
        console.log('content-type:', res.headers['content-type']);
        console.log('content-length:', res.headers['content-length']);

        var out = fs.createWriteStream(filename);
        out.on('error', finish).on('close', function () {
            finish();
        });
        request(uri).on('error', function (streamErr) {
            finish(streamErr);
            // pipe() does not close the destination when the source fails.
            out.destroy();
        }).pipe(out);
    });
}
/**
 * Recursively list every non-directory entry below `root`.
 *
 * Entries are inspected with lstat, so a symbolic link is reported as a file
 * and never followed. Paths are built as `<parent>/<entry>`.
 *
 * @param {string} root - directory to scan
 * @returns {Array} paths of all files found, in depth-first directory order
 */
function getAllFiles(root) {
    var found = [];
    (function walk(dir) {
        var entries = fs.readdirSync(dir);
        for (var i = 0; i < entries.length; i++) {
            var fullPath = dir + '/' + entries[i];
            if (fs.lstatSync(fullPath).isDirectory()) {
                walk(fullPath);
            } else {
                found.push(fullPath);
            }
        }
    })(root);
    return found;
}
/**
 * Rename (or copy under a new name) every file found below `sourceDoc`.
 *
 * @param {string} sourceDoc - directory that is scanned recursively
 * @param {string} [destinationDoc] - directory receiving the results; when omitted,
 *        every file stays in its own directory
 * @param {function(string): string} [renameOperation] - maps the old base name to the
 *        new one; returning a falsy value skips the file
 * @param {boolean} [isMove] - true to move the originals, false to copy them
 *
 * @example
 * // Turn every "<name>.<ext>" into "schoolInfo-<name>.txt" in another directory
 * ioHelper.filesRename('/Users/ted/Downloads/info', '/Users/ted/Downloads/renameInfo', function (name) {
 *     var arr = name.split('.');
 *     if (!arr[0]) { return; }
 *     arr[1] = 'txt';
 *     arr[0] = 'schoolInfo-' + arr[0];
 *     return arr.join('.');
 * });
 */
function filesRename(sourceDoc, destinationDoc, renameOperation, isMove) {
    // parseInt instead of a 0775 literal: legacy octal literals are a
    // SyntaxError in strict mode and in ES modules.
    var dirMode = parseInt('775', 8);
    var fileArray = getAllFiles(sourceDoc);
    fileArray.forEach(function (pathName) {
        var name = path.basename(pathName);
        if (renameOperation) {
            name = renameOperation(name);
        }
        if (!name) return;

        // Resolved per file. Overwriting destinationDoc here would send every
        // later file into the directory of the first one.
        var targetDir = destinationDoc || path.dirname(pathName);
        var finalPath = targetDir + '/' + name;

        // Nothing to do, and copying a file onto itself would truncate it.
        if (path.resolve(finalPath) === path.resolve(pathName)) return;

        if (!fs.existsSync(targetDir)) {
            mkdirp.sync(targetDir, dirMode);
        }
        if (isMove) {
            fs.renameSync(pathName, finalPath);
        } else {
            fileCopy(pathName, finalPath);
        }
    });
}
/**
 * Copy a single file by streaming `oldPath` into `newPath`, creating the
 * parent directory of `newPath` first when it does not exist.
 *
 * The copy is asynchronous. Without `callback` the streams are left
 * unobserved; with it, the outcome is reported exactly once.
 *
 * @param {string} oldPath - file to read
 * @param {string} newPath - file to create or overwrite
 * @param {function(?Error)} [callback] - called with null once the destination was closed,
 *        or with the error raised by either stream
 */
function fileCopy(oldPath, newPath, callback) {
    var targetDir = path.dirname(newPath);
    if (!fs.existsSync(targetDir)) {
        // Create the parent directory, not a directory at newPath itself
        // (that would make the write below fail).
        // parseInt avoids a legacy octal literal, which strict mode rejects.
        mkdirp.sync(targetDir, parseInt('775', 8));
    }

    var reader = fs.createReadStream(oldPath);
    var writer = fs.createWriteStream(newPath);

    if (callback) {
        var finished = false;
        var finish = function (err) {
            if (finished) return;
            finished = true;
            callback(err || null);
        };
        reader.on('error', function (err) {
            finish(err);
            // pipe() does not close the destination when the source fails.
            writer.destroy();
        });
        writer.on('error', finish).on('close', function () {
            finish(null);
        });
    }

    reader.pipe(writer);
}
// Public API of the ioHelper module. The local `exports` alias is rebound to
// the same object that is published through module.exports.
exports = {
    simpleWrite: simpleWrite,
    getHtml: getHtml,
    download: download,
    getAllFiles: getAllFiles,
    filesRename: filesRename,
    fileCopy: fileCopy
};
module.exports = exports;
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
var ioHelper = require('./ioHelper');
/**
 * A small request pool: URLs are queued with push() and fetched through
 * ioHelper.getHtml, with at most `maxRequestInSameTime` requests in flight.
 *
 * Usage: push() any number of URLs, optionally set `onDone`, then call start().
 * `onDone` fires once, the first time the queue is empty and nothing is in flight.
 */
function IoPool() {
    this.totalRequest = 0;          // requests pushed since creation / last cancel()
    this.aliveRequest = 0;          // requests currently in flight
    this.waitingQueue = [];         // pending {url, callback} entries, FIFO
    this.maxRequestInSameTime = 40; // concurrency cap
    this.onDone = null;             // optional function, called with the pool as `this`
    this.canceled = false;          // true between cancel() and the next start()
    this.doned = false;             // set once onDone has been fired
    this.generation = 0;            // bumped by cancel() to invalidate in-flight requests
}
IoPool.prototype = {
    /**
     * Drop all queued requests and disown the ones in flight: their results
     * are ignored when they eventually arrive. Call start() to run requests
     * pushed afterwards.
     */
    cancel: function () {
        this.totalRequest = 0;
        this.aliveRequest = 0;
        this.waitingQueue = [];
        this.canceled = true;
        // Requests dispatched before this point belong to an older generation;
        // without this their completion would drive aliveRequest below zero.
        this.generation++;
    },
    /** Begin (or resume) dispatching queued requests. */
    start: function () {
        this.canceled = false;
        this.checkQueue();
    },
    /**
     * Queue a URL.
     * @param url - passed as the first argument to ioHelper.getHtml
     * @param {Function} [fn] - invoked with whatever arguments getHtml passes to its callback
     */
    push: function (url, fn) {
        this.totalRequest++;
        this.waitingQueue.push({url: url, callback: fn});
    },
    /**
     * Dispatch as many queued requests as the concurrency cap allows, or,
     * when the queue is empty, fire onDone once nothing is in flight.
     */
    checkQueue: function () {
        var self = this;
        if (self.waitingQueue.length > 0) {
            // Re-checked on every iteration so a lowered cap or a re-entrant
            // call can never shift from an empty queue or exceed the cap.
            while (self.waitingQueue.length > 0 && self.aliveRequest < self.maxRequestInSameTime) {
                self.dispatch(self.waitingQueue.shift());
            }
        } else {
            self.log('done');
            if (self.onDone && self.aliveRequest === 0 && !self.doned) {
                self.doned = true;
                self.onDone.call(self);
            }
        }
    },
    /**
     * Start one request and wire its completion back into the pool.
     * @param {{url: *, callback: Function}} currentRequest - entry taken from waitingQueue
     */
    dispatch: function (currentRequest) {
        var self = this;
        var generation = self.generation;
        var settled = false;
        self.aliveRequest++;
        self.log();
        ioHelper.getHtml(currentRequest.url, function () {
            // Ignore duplicate completions and results of canceled requests,
            // both of which would corrupt the in-flight counter.
            if (settled || generation !== self.generation) return;
            settled = true;
            self.aliveRequest--;
            try {
                var callback = currentRequest.callback;
                if (callback) {
                    callback.apply(this, arguments);
                }
            } finally {
                // Keep the pool moving even when the user callback throws.
                self.checkQueue();
            }
        });
    },
    /**
     * Print the pool state.
     * @param {string} [tag] - label for the log line, "default" when omitted
     */
    log: function (tag) {
        var self = this;
        // Share of requests still waiting or in flight, truncated to an integer.
        var percentage = ((self.waitingQueue.length + self.aliveRequest) / self.totalRequest * 100) >> 0;
        console.log("tag:" + (tag || "default"), "total:" + self.totalRequest, "waitingQueue:" + self.waitingQueue.length, "requesting:" + self.aliveRequest, "left:" + percentage + "%");
    }
};
module.exports = exports = IoPool;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment