Created
June 6, 2018 01:54
-
-
Save hliang/be8d50817cb789d69003ed4f0e0a5853 to your computer and use it in GitHub Desktop.
NLP toys: analysis of song lyrics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "NLP toys"
author: "hliang"
date: "8/24/2016"
output: html_document
---

Use NLP tools to analyze song lyrics.

Load the required packages:

```{r}
# One-time installation steps (left commented out, as in the original):
#   install.packages("jiebaR")
#   install.packages("devtools")
#   devtools::install_github("qinwf/jiebaRD")
#   devtools::install_github("qinwf/jiebaR")

# Attach the packages used by the chunks below: jiebaR supplies the workers
# for segmentation and simhash, jsonlite supplies fromJSON().
library(jiebaRD)
library(jiebaR)
library("jsonlite")

# NOTE(review): two setwd() calls in a row -- the second decides the final
# working directory. Presumably two alternative project locations; confirm
# and keep only the one that applies.
setwd("~/prj/playr/jieba")
setwd("~/lyric/")
```

Download the lyrics:

```{r}
# Download, for every song id, the "detail" and "media" JSON documents from the
# music.163.com API into ./lrc/<id>.detail.json and ./lrc/<id>.media.json.

# Re-running the document must not warn just because the folder already exists.
dir.create("lrc", showWarnings = FALSE)

# Earlier experiments, kept for reference:
# setwd("./lrc")
# download.file("http://www.cnlyric.com/LrcDown/3098/185575.lrc", destfile = "./lrc/185575.lrc")
# download.file("http://www.cnlyric.com/LrcDown/3098/118126.lrc", destfile = "./lrc/118126.lrc")
# download.file("http://www.cnlyric.com/LrcDown/2778/228924.lrc", destfile = "./lrc/228924.lrc")
# download.file("http://music.163.com/api/song/media?id=96661", destfile = "./lrc/96661.lrc")

# The downloaded files are JSON; they are parsed later with jsonlite.
# Song ids on music.163.com. Titles below were reconstructed from a
# mis-encoded comment -- verify: 5260326 "好人", 25638702 "童话",
# 5274094 "我们这里还有鱼". The titles noted for 186331 and 28680228 were
# not recoverable; "99" and "99999" carried no title note.
s_ids <- c("186331", "5260326", "25638702", "99", "99999", "5274094", "28680228")

for (id163 in s_ids) {
  # One URL per kind of document; the names become part of the file name.
  targets <- c(
    detail = paste0("http://music.163.com/api/song/detail/?ids=[", id163, "]"),
    media = paste0("http://music.163.com/api/song/media?id=", id163)
  )
  for (kind in names(targets)) {
    dest <- paste0("./lrc/", id163, ".", kind, ".json")
    # A failed download is reported and skipped so that the remaining ids are
    # still fetched; any partial file is removed so it is not parsed later.
    tryCatch(
      download.file(targets[[kind]], destfile = dest),
      error = function(e) {
        unlink(dest)
        warning(
          "download failed for ", targets[[kind]], ": ", conditionMessage(e),
          call. = FALSE
        )
      }
    )
  }
}
```

Parse the JSON files and extract the lyrics:

```{r}
# Parse the downloaded JSON files into:
#   lrc    - data frame with one row per usable song (id, name, artist, lyric)
#   artist - list of artist-name vectors, one entry per row of `lrc`
# Songs whose name or lyric is missing are dropped at the end.

# NOTE(review): the second setwd() overrides the first. Presumably two
# alternative project locations; confirm and keep only one.
setwd("~/prj/playr/jieba")
setwd("~/lyric/")

# Read one JSON file. Returns NULL (with a warning) when the file is missing
# or cannot be parsed, so one bad download does not abort the whole loop.
read_song_json <- function(path) {
  if (!file.exists(path)) {
    warning("missing file: ", path, call. = FALSE)
    return(NULL)
  }
  tryCatch(
    fromJSON(txt = path),
    error = function(e) {
      warning("cannot parse ", path, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}

lrc <- data.frame(
  id = paste0("sid_", s_ids),
  name = NA_character_,
  artist = NA_character_,
  lyric = NA_character_,
  stringsAsFactors = FALSE
)

# `artist` was previously assigned into without ever being created, which
# fails with "object 'artist' not found". Start with NA for every song.
artist <- setNames(as.list(rep(NA_character_, nrow(lrc))), lrc$id)

for (i in seq_len(nrow(lrc))) {
  id163 <- s_ids[i]
  s_detail <- read_song_json(paste0("./lrc/", id163, ".detail.json"))
  s_media <- read_song_json(paste0("./lrc/", id163, ".media.json"))

  song_name <- s_detail$songs$name
  song_lyric <- s_media$lyric

  # Both pieces are required. The old test only skipped a song when BOTH were
  # NULL, so a song with just one of them crashed on the assignment below.
  if (is.null(song_name) || is.null(song_lyric)) {
    next
  }

  lrc$name[i] <- song_name[1]
  # Strip the "[mm:ss.xx]" style timestamps from the lyric text.
  lrc$lyric[i] <- gsub("\\[[0-9.:]*\\]", "", song_lyric[1])

  song_artists <- s_detail$songs$artists
  artist_names <- if (length(song_artists) > 0) song_artists[[1]]$name else NULL
  if (length(artist_names) > 0) {
    artist[[i]] <- artist_names
    # Also fill the data-frame column; it used to stay NA for every song,
    # which made the filter below discard all rows.
    lrc$artist[i] <- paste(artist_names, collapse = ", ")
  }
}

# Remove entries with any missing field.
keep <- complete.cases(lrc[, c("name", "artist", "lyric")])
lrc <- lrc[keep, ]
artist <- artist[keep]
```

Segment the lyrics into words:

```{r}
# Create the jiebaR segmentation worker (same options as before).
cutter <- worker(bylines = TRUE, write = FALSE)

# Run the worker over every lyric and label each result with its song id.
segs <- setNames(cutter[lrc$lyric], lrc$id)

# Print a compact overview of the segmentation result.
str(segs)
```

Compute simhash fingerprints and pairwise distances:

```{r}
# Simhash worker; topn = 10 is passed through unchanged.
simhasher <- worker("simhash", topn = 10)

# segs = lapply(lrc, "[[", "seg") # extract segmentations

# One simhash result per segmented song.
sims <- lapply(segs, vector_simhash, jiebar = simhasher)

# Pull out the "simhash" element of every result. Flattening the nested list
# yields names of the form "<song id>.simhash", which are cleaned up below.
x <- unlist(lapply(sims, "[", "simhash"))

# Pairwise distance matrix between all fingerprints, labelled on both axes.
distmat <- simhash_dist_mat(x, x)
dimnames(distmat) <- list(names(x), names(x))

# Hierarchical clustering on those distances.
d <- as.dist(distmat)
fit <- hclust(d)

# Replace the "<song id>.simhash" labels with the song names for plotting.
tmpid <- gsub("(sid_.*)\\.simhash", "\\1", fit$labels)
tmpname <- lrc$name[match(tmpid, lrc$id)]
fit$labels <- tmpname
```

You can also embed plots, for example:

```{r, echo=FALSE}
# Draw the clustering result stored in `fit`.
plot(x = fit)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment