Skip to content

Instantly share code, notes, and snippets.

@xccds
Created April 4, 2015 11:28

Revisions

  1. xccds created this gist Apr 4, 2015.
    125 changes: 125 additions & 0 deletions china_politics.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,125 @@
    # Load the 2014 survey export; text answers stay as character vectors.
    df <- read.csv("2014data.csv", stringsAsFactors = FALSE)
    # Columns 4-53 become the 50 "x" variables, columns 54-57 the 4 "y" variables.
    df1 <- df[, 4:53, drop = FALSE]
    df2 <- df[, 54:57, drop = FALSE]

    # Tidy up: remember the original headers, then switch to short x1..x50 /
    # y1..y4 names that are easier to reference in code.
    library(plyr)
    df1_names <- names(df1)
    names(df1) <- paste0("x", seq_along(df1))

    df2_names <- names(df2)
    names(df2) <- paste0("y", seq_along(df2))

    # Convert a vector of agreement answers into signed numeric scores:
    # strongly agree = 2, agree = 1, disagree = -1, strongly disagree = -2.
    #
    # x: vector of answer labels.
    # Returns a numeric vector of the same length; labels outside the four
    # known answers go through as.numeric() unchanged (so non-numeric text
    # ends up as NA).
    map_func <- function(x) {
      scored <- mapvalues(
        x,
        from = c("强烈同意", "同意", "反对", "强烈反对"),
        to = c(2, 1, -1, -2)
      )
      as.numeric(scored)
    }

    # Score every question column with map_func().
    df1_1 <- colwise(map_func)(df1)

    # Recode the four respondent attributes into numeric codes.
    df2_2 <- df2

    # y1: 1 where the raw value is "F", 0 otherwise.
    df2_2$y1 <- ifelse(df2$y1 == "F", 1, 0)

    # y2: 2015 minus the raw value (presumably a birth year, giving age --
    # confirm against the data), binned into 10 ordered brackets coded 1..10.
    df2_2$y2 <- as.numeric(
      cut(
        2015 - df2_2$y2,
        breaks = c(0, 18, 22, 25, 30, 35, 40, 50, 60, 70, 120),
        labels = 1:10
      )
    )

    # y3: income bracket label -> 1..7 (lowest to highest).
    df2_2$y3 <- as.numeric(
      mapvalues(
        df2$y3,
        from = c("0-25k", "25k-50k", "50k-75k", "75k-100k", "100k-150k", "150k-300k", "300k+"),
        to = 1:7
      )
    )

    # y4: education level label -> 1..4 (lowest to highest).
    df2_2$y4 <- as.numeric(
      mapvalues(
        df2$y4,
        from = c("初中及以下", "高中", "大学", "研究生及以上"),
        to = 1:4
      )
    )

    # Drop problematic records.
    # Combine the scored questions (x1..x50) with the recoded attributes (y1..y4)
    # and keep only rows without any missing value.
    df3 <- cbind(df1_1, df2_2)
    df4 <- df3[complete.cases(df3), ]
    # The first filter must start from df4: df5 does not exist yet at this point
    # (the original referenced df5 here, which fails with "object not found").
    # Remove the oldest age bracket (code 10, i.e. the (70, 120] bin).
    df5 <- subset(df4, !(y2 == 10))
    # Remove implausible combinations for the youngest age bracket (code 1):
    # top education level (code 4) ...
    df5 <- subset(df5, !(y2 == 1 & y4 == 4))
    # ... or an income bracket above code 5.
    df5 <- subset(df5, !(y2 == 1 & y3 > 5))


    # Normalised mutual information between two discrete vectors.
    #
    # x, y: vectors of equal length holding categorical values.
    # Returns a single number: the mutual information of x and y (in bits)
    # divided by the mean of their two marginal entropies. A small epsilon is
    # added inside every log2() so empty cells do not produce -Inf.
    im_func <- function(x, y) {
      eps <- 1e-8
      marg_x <- as.vector(prop.table(table(x)))
      marg_y <- as.vector(prop.table(table(y)))
      # Joint distribution, rows = levels of x, columns = levels of y.
      joint <- matrix(prop.table(table(x, y)), ncol = length(marg_y))
      # Joint distribution expected under independence.
      indep <- outer(marg_x, marg_y)
      mutual_info <- sum(joint * (log2(joint + eps) - log2(eps + indep)))
      neg_entropy_x <- sum(marg_x * log2(marg_x + eps))
      neg_entropy_y <- sum(marg_y * log2(marg_y + eps))
      mean_entropy <- -0.5 * (neg_entropy_x + neg_entropy_y)
      mutual_info / mean_entropy
    }

    # Pairwise association matrix: result[i, j] holds im_func() for columns i
    # and j of df5. Only the lower triangle (j < i) is computed; the upper
    # triangle stays NA and the diagonal is set to 0 so that a column is never
    # reported as its own best match.
    m <- ncol(df5)
    result <- matrix(NA_real_, nrow = m, ncol = m)
    # seq_len() keeps both loops empty when there is nothing to iterate over
    # (1:0 would count backwards). The diagonal is skipped because it is
    # overwritten with 0 right after the loop.
    for (i in seq_len(m)) {
      for (j in seq_len(i - 1L)) {
        result[i, j] <- im_func(df5[[i]], df5[[j]])
      }
    }

    diag(result) <- 0

    # Which pair of questions is most strongly associated.
    # Rows/columns 1-50 of result are the questions x1..x50; 51-54 are y1..y4.
    max_v <- max(result[1:50, 1:50], na.rm = TRUE)
    # Search only the question-by-question block that max_v was taken from, so
    # an equal value in one of the y rows cannot be reported by accident.
    which(result[1:50, 1:50] == max_v, arr.ind = TRUE)
    df1_names[c(3, 6)]
    table(df5$x3, df5$x6)

    # Which question is most associated with education (row 54 = y4).
    order(result[54, ], decreasing = TRUE)
    df1_names[41]
    table(df5$x41, df5$y4)


    # Which question is most associated with age (row 52 = y2).
    order(result[52, ], decreasing = TRUE)
    # NOTE(review): the header printed is question 35 but the table below uses
    # x30 -- one of the two indices looks like a copy-paste slip. Confirm against
    # the order() output above and make them agree.
    df1_names[35]
    table(df5$x30, df5$y2)

    # Which question is most associated with income (row 53 = y3).
    order(result[53, ], decreasing = TRUE)
    df1_names[35]
    table(df5$x35, df5$y3)


    # Which question is most associated with gender (row 51 = y1).
    order(result[51, ], decreasing = TRUE)
    df1_names[30]
    table(df5$x30, df5$y1)


    # Model: gradient boosting with y3 (income bracket code) as the response and
    # every other column of df5 as a predictor.
    library(gbm)
    model <- gbm(
      y3 ~ .,
      data = df5,
      distribution = "multinomial",
      n.trees = 200,
      shrinkage = 0.01,
      train.fraction = 0.8,
      cv.folds = 5
    )

    # Class probabilities: the prediction is indexed as a 3-D array, of which the
    # first slice is reshaped into a 7-column matrix (presumably one column per
    # y3 bracket -- confirm). The predicted class is the column with the
    # largest value in each row.
    pred <- predict(model, type = "response")
    pred <- matrix(pred[, , 1], ncol = 7)
    pred_y <- apply(pred, 1, which.max)

    # Relative influence of each predictor; show the non-zero ones in
    # ascending order, then inspect three questions against y3.
    coef <- relative.influence(model)
    sort(coef[coef > 0])
    df1_names[16]
    table(df5$x16, df5$y3)
    df1_names[41]
    table(df5$x41, df5$y3)
    df1_names[12]
    table(df5$x12, df5$y3)

    # # 政治
    # df5$poli = rowMeans(df5[,1:20])
    # # 经济
    # df5$econ = rowMeans(df5[,21:40])
    # # 文化
    # df5$cult = rowMeans(df5[,41:50])
    #
    # # cluster
    # library(fpc)
    # pka <- kmeansruns(df5[,c('poli','econ','cult')],krange=2:6,critout=TRUE,runs=2,criterion="asw")