服务热线
178 0020 3020
R语言与MySQL数据库
任务1:
install.packages("DBI")
install.packages('RMySQL',type='source')
library(DBI)
library(RMySQL)
con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')##创建链接
dbGetInfo(con)##显示数据库的信息
dbListTables(con)
dbRemoveTable(con,"test")
summary(con)OUTPUT: > con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="") > dbSendQuery(con,'SET NAMES utf8') <MySQLResult:472858824,0,0> > dbGetInfo(con) $host [1] "localhost" $user [1] "root" $dbname [1] "rdb" $conType [1] "localhost via TCP/IP" $serverVersion [1] "5.6.17" $protocolVersion [1] 10 $threadId [1] 934 $rsId $rsId[[1]] <MySQLResult:0,0,0> > dbListTables(con) [1] "article" "word" > dbRemoveTable(con,"test") [1] FALSE > summary(con) <MySQLConnection:0,0> User: root Host: localhost Dbname: rdb Connection type: localhost via TCP/IP Results:
任务2:
killDbConnections()
install.packages("httr")
library(DBI)
library(RMySQL)
con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')
library(httr)
baseUrl="https://eutils.ncbi.nlm.nih.gov/"
totalNum=562
pageSize=10
totalPage=ceiling(totalNum/pageSize)
currentPage=1
term='(cell[TA]) AND 2017[DP]'
usehistory='Y'#是否使用历史搜索
querykey=''
webenv=''
postSearchUrl='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
while(currentPage<=totalPage){ retstart=(currentPage-1)*pageSize
r <- POST(postSearchUrl,
body = list(
db='pubmed',
term=term,
retmode='json',
retstart=retstart,
retmax=pageSize,
usehistory=usehistory,
rettype='uilist'
) )
stop_for_status(r) #clear http status
data=content(r, "parsed", "application/json")#data里面存储了所有数据
esearchresult=data$esearchresult
#$idlist=array $count=562,$retmax=20, $retstart=0,$querykey=1,
$webenv=NCID_1_30290513_130.14.18.34_9001_1515165012_617859421_0MetA0_S_MegaStore_F_1
querykey=esearchresult$querykey
webenv=esearchresult$webenv
idlist =esearchresult$idlist#idlist为搜索结果中pmid的合集,下面的代码用于拼接出Rmysql需要的数据
n=length(idlist)
pmid=c()
i=1
while(i<=n){
pmid=c(pmid,as.character(idlist[i][1]))
i=i+1
}
article=data.frame('pmid'=pmid)
dbWriteTable(con,"article",article,append=TRUE)
currentPage=currentPage+1##while循环后记得增加,否则就是死循环了。
}
dbDisconnect(con)OUTPUT: > killDbConnections() [[1]] <MySQLConnection:0,543> [[2]] <MySQLConnection:0,545> [1] "2 connections killed." Warning messages: 1: Closing open result sets 2: Closing open result sets ... > con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="") > dbSendQuery(con,'SET NAMES utf8') <MySQLResult:122396728,546,0> ...... + currentPage=currentPage+1##while循环后记得增加,否则就是死循环了。 + } > dbDisconnect(con) [1] TRUE

任务3:
##任务3
##从mysql数据库里面循环取出id,每次取出10个,然后获取到title和abstract
#为什么要用mysql数据库?
#1. 本次作业数量比较少,用其他方法比如txt文本存储也是可以
#2. mysql是当前最流行的数据库,学习mysql数据库的使用
#3. 如果网络获取数量达百万级,一次执行不可能获得所有内容,
可能多次中断执行,用mysql数据库方便纪录哪些已经被处理了。
killDbConnections() #清除所有mysql连接,否则会报错说超过16个连接
library(RMySQL)
library(xml2)
library(httr)
con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')#创建数据库连接
#isdone=0 表示查询article表里面还没有获取完的条目
rs <- dbSendQuery(con, "SELECT * FROM article WHERE isdone=0")
while (!dbHasCompleted(rs)) {
chunk <- dbFetch(rs, 10) #mode(chunk),print(chunk),chunk[x,3] 第3列为获取到的pmid
pmidStr=""
i=1
n=nrow(chunk)#获得总行数,和上面设置的10一致,最后的时候是3
while(i<=n){
pmidStr=paste(pmidStr,chunk[i,3],sep=",")
i=i+1
}
#pmid=",29195067,29195066,29195065,29195064,29153837,29153836,29153835,29153834,29153833,29153832"
#去掉pmid第一个逗号,从第2位起,到100000位,上面字符串没有这么多字符,因此到末尾
pmidStr=substr(pmidStr,2,100000)
#上面字符串就是我们post到pubmed上面的字符串,用于获取title和abstract
#下面就是第一次作业里面获取title和abstract
postFetchUrl='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
r2 <- POST(postFetchUrl,
body = list(
db='pubmed',
id=pmidStr,
retmode='xml'
)
)
stop_for_status(r2) #clear http status
data2=content(r2, "parsed", "application/xml")
article=xml_children(data2)
#xml_length(article)为里面文章的数量
count=length(article)
cnt=1
while(cnt<=count){
#下面的xml_text和xml_find_first均为XML2包里面的函数
title=xml_text(xml_find_first(article[cnt],".//ArticleTitle")) #找到第一个ArticleTitle节点
abstract=xml_text(xml_find_first(article[cnt],".//AbstractText"))
pmid=xml_text(xml_find_first(article[cnt],".//PMID"))
#接下来我们要更新数据库
#1首先我们去掉title和abstract里面的单引号,单引号会导致mysql更新出现问题
title=gsub("'","",title)
abstract=gsub("'","",abstract)
#1构建mysql更新语句,R语言的字符串拼接不太好,不能使用"+",也不能使用点"."
#设置isdone字段用于标记已经处理完的
sql=paste("UPDATE article SET title='",title,"',abstract='",abstract,"',isdone=1"," where pmid='",pmid,"'",sep="")
#2执行,需要新开通一个mysql连接
con2 <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con2,'SET NAMES utf8')
dbSendQuery(con2,sql)
dbDisconnect(con2)
cnt = cnt + 1
#延迟1秒运行,pubmed接口说明如果1秒内并发超过3次将会被封禁IP
Sys.sleep(1)
#break 用于中断循环,调试程序的时候非常有用
}
}OUTPUT: > killDbConnections() list() [1] "0 connections killed." > con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="") > dbSendQuery(con,'SET NAMES utf8')#创建数据库连接 <MySQLResult:34,547,0> ...... No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. No encoding supplied: defaulting to UTF-8. There were 50 or more warnings (use warnings() to see the first 50)##因为我再一次搜索了,搜索条目增加了,可能是重复了。


附件