R2-03 第二期-第2次作业

微思微丝 2018-02-08 10:21:10 阅读: 1425

R语言与MySQL数据库

任务1：

install.packages("DBI")
install.packages('RMySQL',type='source')
library(DBI)
library(RMySQL)
con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')##创建链接
dbGetInfo(con)##显示数据库的信息
dbListTables(con)
dbRemoveTable(con,"test")
summary(con)

OUTPUT:
> con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
> dbSendQuery(con,'SET NAMES utf8')
<MySQLResult:472858824,0,0>
> dbGetInfo(con)
$host
[1] "localhost"

$user
[1] "root"

$dbname
[1] "rdb"

$conType
[1] "localhost via TCP/IP"

$serverVersion
[1] "5.6.17"

$protocolVersion
[1] 10

$threadId
[1] 934

$rsId
$rsId[[1]]
<MySQLResult:0,0,0>


> dbListTables(con) 
[1] "article" "word"   
> dbRemoveTable(con,"test")
[1] FALSE
> summary(con)
<MySQLConnection:0,0>
  User:   root 
  Host:   localhost 
  Dbname: rdb 
  Connection type: localhost via TCP/IP 

Results:

任务2：

killDbConnections()
install.packages("httr")
library(DBI)
library(RMySQL)
con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')

library(httr)
baseUrl="https://eutils.ncbi.nlm.nih.gov/"
totalNum=562
pageSize=10
totalPage=ceiling(totalNum/pageSize)
currentPage=1
term='(cell[TA]) AND 2017[DP]'
usehistory='Y'#是否使用历史搜索
querykey=''
webenv=''
postSearchUrl='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
while(currentPage<=totalPage){ retstart=(currentPage-1)*pageSize
  r <- POST(postSearchUrl, 
            body = list(
              db='pubmed',
              term=term,
              retmode='json',
              retstart=retstart,
              retmax=pageSize,
              usehistory=usehistory,
              rettype='uilist'
            ) )
  stop_for_status(r) #clear http status
  data=content(r, "parsed", "application/json")#data里面存储了所有数据
 
  esearchresult=data$esearchresult
  #$idlist=array $count=562,$retmax=20, $retstart=0,$querykey=1, 
  $webenv=NCID_1_30290513_130.14.18.34_9001_1515165012_617859421_0MetA0_S_MegaStore_F_1 
  querykey=esearchresult$querykey
  webenv=esearchresult$webenv
  idlist =esearchresult$idlist#idlist为搜索结果中pmid的合集，下面的代码用于拼接出Rmysql需要的数据
  n=length(idlist)
  pmid=c()
  i=1
  while(i<=n){
    pmid=c(pmid,as.character(idlist[i][1]))
    i=i+1
  }
  article=data.frame('pmid'=pmid)
  dbWriteTable(con,"article",article,append=TRUE)
  currentPage=currentPage+1##while循环后记得增加，否则就是死循环了。
}
dbDisconnect(con)

OUTPUT:
> killDbConnections()
[[1]]
<MySQLConnection:0,543>

[[2]]
<MySQLConnection:0,545>

[1] "2  connections killed."
Warning messages:
1: Closing open result sets 
2: Closing open result sets 

...
> con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
> dbSendQuery(con,'SET NAMES utf8')
<MySQLResult:122396728,546,0>

......
+   currentPage=currentPage+1##while循环后记得增加，否则就是死循环了。
+ }
> dbDisconnect(con)
[1] TRUE

任务3：

##任务3
##从mysql数据库里面循环取出id，每次取出10个，然后获取到title和abstract
#为什么要用mysql数据库？
#1. 本次作业数量比较少，用其他方法比如txt文本存储也是可以
#2. mysql是当前最流行的数据库，学习mysql数据库的使用
#3. 如果网络获取数量达百万级，一次执行不可能获得所有内容，
可能多次中断执行，用mysql数据库方便纪录哪些已经被处理了。
killDbConnections() #清除所有mysql连接，否则会报错说超过16个连接
library(RMySQL)
library(xml2)
library(httr)

con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')#创建数据库连接 

#isdone=0 表示查询article表里面还没有获取完的条目
rs <- dbSendQuery(con, "SELECT * FROM article WHERE isdone=0")
while (!dbHasCompleted(rs)) {
  chunk <- dbFetch(rs, 10) #mode(chunk)，print(chunk)，chunk[x,3] 第3列为获取到的pmid
  pmidStr=""
  i=1
  n=nrow(chunk)#获得总行数，和上面设置的10一致，最后的时候是3
  while(i<=n){
    pmidStr=paste(pmidStr,chunk[i,3],sep=",")
    i=i+1
  }
  #pmid=",29195067,29195066,29195065,29195064,29153837,29153836,29153835,29153834,29153833,29153832"
  #去掉pmid第一个逗号,从第2位起，到100000位，上面字符串没有这么多字符，因此到末尾
  pmidStr=substr(pmidStr,2,100000)
  #上面字符串就是我们post到pubmed上面的字符串，用于获取title和abstract
  #下面就是第一次作业里面获取title和abstract
  postFetchUrl='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
  r2 <- POST(postFetchUrl, 
             body = list(
               db='pubmed',
               id=pmidStr,
               retmode='xml'
             )
  )
  stop_for_status(r2) #clear http status
  data2=content(r2, "parsed", "application/xml")
  article=xml_children(data2)
  #xml_length(article)为里面文章的数量
  count=length(article)
  cnt=1
  while(cnt<=count){
    #下面的xml_text和xml_find_first均为XML2包里面的函数
    title=xml_text(xml_find_first(article[cnt],".//ArticleTitle")) #找到第一个ArticleTitle节点
    abstract=xml_text(xml_find_first(article[cnt],".//AbstractText"))
    pmid=xml_text(xml_find_first(article[cnt],".//PMID"))
    #接下来我们要更新数据库
    #1首先我们去掉title和abstract里面的单引号，单引号会导致mysql更新出现问题
    title=gsub("'","",title)
    abstract=gsub("'","",abstract)
    #1构建mysql更新语句,R语言的字符串拼接不太好，不能使用"+"，也不能使用点"."
    #设置isdone字段用于标记已经处理完的
    sql=paste("UPDATE article SET title='",title,"',abstract='",abstract,"',isdone=1"," where pmid='",pmid,"'",sep="")
    #2执行,需要新开通一个mysql连接
    con2 <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
    dbSendQuery(con2,'SET NAMES utf8')
    dbSendQuery(con2,sql)
    dbDisconnect(con2)
    cnt = cnt + 1
    #延迟1秒运行，pubmed接口说明如果1秒内并发超过3次将会被封禁IP
    Sys.sleep(1)
    #break 用于中断循环，调试程序的时候非常有用
  }
}

OUTPUT:
> killDbConnections()
list()
[1] "0  connections killed."

> con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
> dbSendQuery(con,'SET NAMES utf8')#创建数据库连接 
<MySQLResult:34,547,0>

...... 
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
No encoding supplied: defaulting to UTF-8.
There were 50 or more warnings (use warnings() to see the first 50)##因为我再一次搜索了，搜索条目增加了，可能是重复了。

1518055005(1).png