Created
April 22, 2015 04:37
-
-
Save ninegua/923175791dec7a8a3259 to your computer and use it in GitHub Desktop.
Weibo Scrapper (using conduit == 0.2.*) written in 2012
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings, RankNTypes, ScopedTypeVariables, | |
NoMonomorphismRestriction, DeriveDataTypeable #-} | |
module Main where | |
import Prelude hiding (and, catch) | |
import Data.Char (toLower) | |
import Data.Conduit | |
import Data.Conduit.Util | |
import Data.Conduit.ImageSize (sinkImageInfo) | |
import Data.Conduit.Binary (sourceFile, conduitFile, sinkFile) | |
import Data.Conduit.Lazy (lazyConsume) | |
import qualified Data.Conduit.List as CL | |
import Network.HTTP.Conduit (parseUrl, Response (..), urlEncodedBody, withManager, Cookie(..)) | |
import Network.HTTP.Conduit.Browser (BrowserAction, setCookieFilter, browse, makeRequest, setUserAgent) | |
import Text.HTML.TagStream (Token, Token'(..), tokenStream, encode) | |
import qualified Network.HTTP.Types as T | |
import System.Environment (getArgs) | |
import System.IO (stderr, hPutStrLn, getContents) | |
import System.Directory (renameFile) | |
import System.Random (randomRIO) | |
import qualified Data.ByteString as B | |
import qualified Data.ByteString.Lazy as L | |
import qualified Data.ByteString.Lazy.UTF8 as LU | |
import qualified Data.ByteString.UTF8 as BU | |
import Control.Monad (when, (=<<)) | |
import Control.Monad.IO.Class (liftIO) | |
import Control.Monad.State (StateT) | |
import qualified Control.Monad.State as S | |
import Control.Monad.Trans (lift) | |
import Control.Monad.ST (ST, runST) | |
import Control.Arrow ((***), first, second) | |
import Control.Exception.Lifted | |
import Data.Monoid (Monoid, mappend, mempty) | |
import qualified Data.String as DS | |
import Data.List (splitAt, partition, dropWhile, takeWhile, span, | |
isInfixOf, isPrefixOf, intercalate) | |
import Data.Typeable (Typeable) | |
import Data.CaseInsensitive (mk) | |
import qualified Data.Map as M | |
import Data.String.Utils (split, replace) | |
import Control.Concurrent (threadDelay, newMVar, withMVar, modifyMVar) | |
iphoneAgent = "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3" | |
prefix = "http://3g.sina.com.cn/prog/wapsite/sso" | |
weibo = "http://weibo.cn" | |
main = do | |
login : pass : _ <- getArgs | |
cookieCache <- newMVar M.empty | |
withManager (flip browse $ | |
setCookieFilter (cookieFilter cookieCache) >> | |
setUserAgent (BU.fromString iphoneAgent) >> | |
startLogin (BU.fromString login) (BU.fromString pass) >>= | |
afterLogin >>= | |
myWeiqun >>= | |
countPages "/qun/detail/" >>= | |
allMyQunPages >>= | |
debug . show) | |
cookieFilter cookieCache _ cookie = do | |
modifyMVar cookieCache $ \cache -> do | |
let name = BU.toString $ cookie_name cookie | |
value = BU.toString $ cookie_value cookie | |
cache' = M.insert name value cache | |
debug $ "Set Cookie: " ++ name ++ " " ++ value | |
return (cache', True) | |
consume file src = lift $ src $= dump file $$ CL.consume | |
data MyException | |
= MalURL | |
| ErrorResponse String | |
| LoginURLNotFound | |
| LoginFormNotFound | |
| UIDNotFound | |
| HomeURLNotFound | |
| MyWeiboURLNotFound | |
| PageListNotFound | |
| PageTotalNotFound | |
| PageTotalNotNumber | |
| CommentTotalNotFound | |
| CommentTotalNotNumber | |
deriving (Show, Typeable) | |
instance Exception MyException | |
data Msg = Msg [Token] [[Token]] | |
instance Show Msg where | |
show (Msg m s) = BU.toString $ B.intercalate "\n" $ | |
"Text: " : p m : "Comment: " : map p s | |
where p = B.append " " . encode | |
msgToHtml (Msg m s) = | |
addDiv "message" $ B.concat $ | |
[encode m, addDiv "comments" $ B.concat $ map (addDiv "comment" . encode) s] | |
where addDiv c s = B.concat ["<div class=\"", c, "\">", s, "</div>"] | |
instance Monoid Msg where | |
mappend (Msg a b) (Msg c d) = (Msg a (b ++ d)) | |
mempty = Msg [] [] | |
debug = liftIO . hPutStrLn stderr | |
send' msg modReq url = do | |
case parseUrl url of | |
Nothing -> throwIO MalURL | |
Just req -> do | |
let req' = modReq req | |
debug $ msg ++ url | |
liftIO $ randomRIO (0,4::Int) >>= threadDelay . (*1000000) | |
Response status headers body <- makeRequest req' | |
debug $ unlines $ ["Status: " ++ show status, | |
"Headers: " ++ show headers] | |
case T.statusCode status of | |
302 -> return $ Right status | |
200 -> return $ Left body | |
_ -> return $ Right status | |
send msg modReq url = do | |
x <- send' msg modReq url | |
case x of | |
Right status -> throwIO $ ErrorResponse $ show status | |
Left body -> return $ body $= tokenStream | |
post form = send "Post to: " $ urlEncodedBody (M.toList form) | |
get = send "Get from: " id | |
getFile = fmap (either id (const $ CL.sourceList [])) . send' "Get file from: " id | |
dump = mapConduit (encode . (:[])) . conduitFile | |
matchTag tag key check (TagOpen name attrs _) = tag == name && (maybe False check $ lookup key attrs) | |
matchTag _ _ _ _ = False | |
pickAttr name = maybe [] ((:[]) . amp . BU.toString) . lookup name | |
parseHref = CL.concatMap $ pickAttr "href" . fst | |
parseForm = CL.concatMap $ uncurry zip . | |
(pickAttr "action" *** (:[]) . M.fromList . concatMap isInput) | |
where | |
isInput (TagOpen a attrs _) | a == "input" = | |
case (lookup "name" attrs, lookup "value" attrs) of | |
(Just n, Just v) -> [(n, v)] | |
(Just n, _) -> [(n, B.empty)] | |
_ -> [] | |
isInput _ = [] | |
parseRefresh = CL.concatMap $ map (dropWhile (/='h')) . pickAttr "content" . fst | |
grabTag match = grabTag' match =$= CL.concatMap (either (:[]) (const [])) | |
grabTag' match = sequenceSink () $ \() -> do | |
t <- CL.head | |
return $ | |
case t of | |
Just x@(TagOpen tag attrs closed) | match x -> | |
if closed then Emit () [Left (attrs, [])] | |
else StartConduit $ grabClose tag attrs | |
Just x -> Emit () [Right x] | |
Nothing -> Stop | |
where | |
grabClose tag attrs = sequenceSink (Just (0, [])) $ \state -> do | |
case state of | |
Nothing -> return $ StartConduit $ grabTag' match | |
Just (l, xs) -> do | |
t <- CL.head | |
return $ | |
case t of | |
Just x -> | |
case x of | |
TagOpen a _ _ | a == tag -> Emit (Just (l + 1, x:xs)) [] | |
TagClose a | a == tag -> if l > 0 then Emit (Just (l - 1, x:xs)) [] | |
else Emit Nothing [Left (attrs, reverse xs)] | |
_ -> Emit (Just (l, x:xs)) [] | |
Nothing -> Stop | |
startLogin login pass = do | |
src <- get weibo | |
login <- throwWhenNothing LoginURLNotFound $ lift $ src | |
$= grabTag (matchTag "a" "href" $ B.isInfixOf "sso/login") | |
$= parseHref $$ CL.head | |
src <- get login | |
(url, form) <- throwWhenNothing LoginFormNotFound $ lift $ src | |
$= grabTag (matchTag "form" "action" $ const True) | |
$= parseForm $$ CL.head | |
post (fill form) (prefix </> url) | |
where | |
f p q k v = if B.isPrefixOf p k then q else v | |
fill = M.insert "mobile" login . M.mapWithKey (f "password" pass) | |
afterLogin src = (get =<<) $ | |
throwWhenNothing HomeURLNotFound $ lift $ src | |
$= grabTag (matchTag "meta" "http-equiv" (=="refresh")) | |
$= parseRefresh $$ CL.head | |
myWeiqun src = (get . (weibo </>) =<<) $ | |
throwWhenNothing MyWeiboURLNotFound $ lift $ src | |
$= grabTag (matchTag "a" "href" $ B.isInfixOf "/qun?") | |
$= parseHref $$ CL.head | |
countPages formStr src = do | |
(url, form) <- throwWhenNothing PageListNotFound $ lift $ src $= dump "main.html" | |
$= dropUntil (matchTag "div" "id" (=="pagelist")) | |
$= grabTag (matchTag "form" "action" $ B.isInfixOf formStr) | |
$= parseForm $$ CL.head | |
debug $ show form | |
totals <- throwWhenNothing PageTotalNotFound $ | |
return $ fmap BU.toString $ M.lookup "mp" form | |
total <- liftIO $ catch (readIO totals :: IO Int) (throw' PageTotalNotNumber) | |
return (url, total, form) | |
parseMsg = parseMsg' 1 | |
parseMsg' i formStr src = do | |
(msg, (cmts, pgs)) <- lift $ src $$ fmap accum $ | |
grabTag' (matchTag "div" "id" $ B.isPrefixOf "M_") =|= | |
grabTag' (matchTag "div" "id" $ B.isPrefixOf "C_") =|= | |
(grabTag (matchTag "form" "action" $ B.isInfixOf formStr) =$= parseForm) =$ | |
CL.consume | |
debug $ "Got msg form: " ++ show pgs | |
let m = Msg (concat $ map snd msg) (map snd cmts) | |
j = i + 1 | |
case pgs of | |
(url, form):_ -> do | |
s <- throwWhenNothing CommentTotalNotFound $ | |
return $ fmap BU.toString $ M.lookup "mp" form | |
n <- liftIO $ catch (readIO s :: IO Int) (throw' CommentTotalNotNumber) | |
if j <= n | |
then fmap (mappend m) $ | |
post (M.insert "page" (showB j) form) (weibo </> url) >>= | |
parseMsg' j formStr . ($= dump "second.html") | |
else return m | |
_ -> return m | |
where | |
aux f g = either (first . f) (second . g) | |
accum = foldr (aux (:) (aux (:) (:))) ([], ([], [])) | |
throw' :: MyException -> SomeException -> a | |
throw' e _ = throw e | |
throwWhenNothing e io = io >>= maybe (throwIO e) return | |
amp = replace "&" "&" | |
(</>) p q = if isPrefixOf "http://" q then q | |
else p ++ "/" ++ (dropWhile (=='/') q) | |
showB = BU.fromString . show | |
myQunPage url form i = do | |
src <- post form' (weibo </> url) | |
urls <- lift $ src | |
$= grabTag (matchTag "div" "id" $ B.isPrefixOf "M_") | |
$= parseDivId $$ CL.consume | |
mapM_ ((>>= printMsg) . get . (weibo </>)) urls | |
where | |
printMsg src = parseMsg "act=view" src >>= dumpMsg >>= liftIO . B.putStrLn . msgToHtml | |
form' = M.insert "page" (showB i) form | |
idToURL id = "/dpool/ttt/grouphome.php?act=view&gmid=" ++ drop 2 id ++ "&groupid=198197&" ++ (dropWhile(=='?') $ dropWhile (not . (=='?')) url) | |
parseDivId = CL.concatMap $ map idToURL . pickAttr "id" . fst | |
allMyQunPages (url, total, form) = mapM_ (myQunPage url form) [35..55] | |
dumpMsg (Msg body cmt) = do | |
body' <- fmap tagImage $ dumpImage body | |
return $ Msg body' cmt | |
tagImage (x@(TagOpen a attrs True) : _ : TagOpen b bttrs True : xs) | a == "img" && b == "a" = | |
TagOpen "a" bttrs False : x : TagClose "a" : tagImage xs | |
tagImage (x:xs) = x : tagImage xs | |
tagImage [] = [] | |
dumpImage (TagOpen a _ _ : x@(TagOpen b attrs closed) : xs) | a == "a" && b == "img" = do | |
let src = pickAttr "src" attrs | |
xs' = dropUntilAfter (isTagClose "a") xs | |
debug $ "src = " ++ show src | |
y <- case src of | |
[url] -> do | |
let file = "thumb/" ++ (last $ split "/" url) | |
safeGuard () $ getFile (weibo </> url) >>= lift . ($$ sinkFile file) | |
return $ TagOpen b (replaceAttr "src" (BU.fromString file) attrs) closed | |
_ -> return x | |
fmap (y:) $ dumpImage xs' | |
dumpImage (x@(TagOpen a attrs False):y:xs) | a == "a" = do | |
let href = pickAttr "href" attrs | |
let next = fmap (x:) $ dumpImage (y:xs) | |
case (y, href) of | |
(Text y', [url]) | y' == BU.fromString "原图" -> do | |
let query = last $ split "/" url | |
args = map (split "=") $ split "&" $ dropUntilAfter (=='?') query | |
hash = concat $ concat $ map tail $ filter (isPrefixOf ["u"]) args | |
if null hash then next | |
else do | |
let file = "image/" ++ hash | |
safeGuard () $ getFile (weibo </> url) >>= lift . ($$ sinkFile file) | |
info <- safeGuard Nothing $ lift (sourceFile file $$ sinkImageInfo) | |
let file' = case info of | |
Just (_, format) -> file ++ "." ++ map toLower (show format) | |
_ -> file | |
liftIO $ when (file /= file') $ renameFile file file' | |
let xs' = dropUntilAfter (isTagClose "a") xs | |
x' = TagOpen "a" [("href", BU.fromString file')] True | |
fmap (x':) $ dumpImage xs' | |
_ -> next | |
dumpImage (x:xs) = fmap (x:) $ dumpImage xs | |
dumpImage [] = return [] | |
replaceAttr key val attrs = (key, val) : filter ((/=key) . fst) attrs | |
isTagClose a (TagClose b) | a == b = True | |
| otherwise = False | |
isTagClose _ _ = False | |
dropUntilAfter f = dropWhile f . dropWhile (not . f) | |
safeGuard d io = catch io $ \(e :: SomeException) -> debug (show e) >> return d |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment