25 Web scraping

25.1 Download Content from NCBI Databases using RISmed

25.1.1 Search author name

# Install RISmed once if it is not available yet:
# install.packages("RISmed")
library(RISmed)

# Search PubMed for records whose author is Shaojun Xie.
res <- EUtilsSummary(
  "Shaojun Xie",
  type = "esearch",
  db = "pubmed"
)

# Print the query as PubMed interpreted it, plus the number of hits.
summary(res)
## Query:
## Xie, Shaojun[Full Author Name] 
## 
## Result count:  25

# PubMed IDs of the records matched for this author.
QueryId(res)
##  [1] "29898403" "29290610" "29232718" "29115936" "29087450" "27934869"
##  [7] "27645898" "27531446" "27252305" "27231839" "27208288" "26943172"
## [13] "26829553" "26667818" "26568274" "25733903" "25593350" "25037214"
## [19] "24920332" "24330712" "24131563" "23860794" "22862992" "22660547"
## [25] "22114195"
# Same author search, restricted to the date window 2010-2019.
res2 <- EUtilsSummary(
  "Shaojun Xie",
  type = "esearch",
  db = "pubmed",
  mindate = "2010",
  maxdate = "2019"
)

# The date limits show up as an [EDAT] range in the interpreted query.
summary(res2)
## Query:
## Xie, Shaojun[Full Author Name] AND 2010[EDAT] : 2019[EDAT] 
## 
## Result count:  25

# PubMed IDs inside the date window. All 25 records fall within 2010-2019,
# so the list is identical to the unrestricted author search.
QueryId(res2)
##  [1] "29898403" "29290610" "29232718" "29115936" "29087450" "27934869"
##  [7] "27645898" "27531446" "27252305" "27231839" "27208288" "26943172"
## [13] "26829553" "26667818" "26568274" "25733903" "25593350" "25037214"
## [19] "24920332" "24330712" "24131563" "23860794" "22862992" "22660547"
## [25] "22114195"

25.1.2 Search keyword

# First: how many PubMed articles match the keyword (WGBS here) for 2008-2018?
str_key <- "WGBS"
res3 <- EUtilsSummary(
  str_key,
  type = "esearch",
  db = "pubmed",
  mindate = "2008",
  maxdate = "2018"
)

summary(res3)
## Query:
## WGBS[All Fields] AND 2008[EDAT] : 2018[EDAT] 
## 
## Result count:  136

# If only the number of articles is needed, ask for the count directly.
QueryCount(res3)
## [1] 136
# Tally the number of matching articles for each year from 2016 to 2018.
#
# In order not to overload the E-utility servers, NCBI recommends that users
# post no more than three URL requests per second and limit large jobs to
# either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays.
# Failure to comply with this policy may result in an IP address being blocked
# from accessing NCBI.

# Single source of truth for the years queried; used for the loop and the labels.
years <- 2016:2018

# Preallocate one slot per year instead of growing the result inside the loop.
tally <- numeric(length(years))
for (k in seq_along(years)) {
  # Pause between requests to stay well under the NCBI rate limit.
  Sys.sleep(1)
  r <- EUtilsSummary(
    str_key,
    type = "esearch",
    db = "pubmed",
    mindate = years[k],
    maxdate = years[k]
  )
  tally[k] <- QueryCount(r)
}

# Label each count with its year so the bar plot gets readable axis labels.
names(tally) <- years
max(tally)
## [1] 41
barplot(
  tally,
  las = 2,
  ylim = c(0, max(tally)),
  main = "Number of PubMed articles"
)