文献计量学系列32: 关键词描述统计、词频分布和模式
一、自定义函数
# Keyword descriptive statistics, frequency distribution and rank pattern.
#
# Arguments:
#   M   - bibliographic data frame, passed unchanged to tableTag().
#   Tag - name of the field to tabulate (default 'DE_TM').
#   sep - separator between keywords inside the field (default ';').
#
# Returns a list with three data frames:
#   MainInformation - one row with
#       keyword      number of distinct keywords (types)
#       keyword_TF   total number of keyword occurrences (tokens)
#       keyword_AF   average frequency per keyword (keyword_TF / keyword)
#       keyword_SD   standard deviation of the frequencies
#       keyword_CV   coefficient of variation in percent (SD / mean * 100)
#       keyword_TTR  type-token ratio in percent (keyword / keyword_TF * 100)
#       keyword_HP   number of keywords that occur exactly once (hapax)
#   Distribution    - for every observed frequency (keywordFreq), how many
#                     keywords have it (Count)
#   Pattern         - the keyword table with rank, rank percentage, cumulative
#                     frequency and cumulative frequency percentage
#
# NOTE: earlier versions of this function computed keyword_CV as
# mean / SD * 100, which is the inverse of the coefficient of variation, so
# keyword_CV values produced by those versions are not comparable.
keywordAnalysis <- function(M, Tag = 'DE_TM', sep = ';'){
  # One row per keyword with columns Tab (keyword) and Freq (count).
  # Assumes tableTag() returns the counts sorted by decreasing frequency;
  # the rank and cumulative columns below rely on that row order.
  data <- as.data.frame(tableTag(M, Tag = Tag, sep = sep),
                        stringsAsFactors = FALSE)
  n_keywords <- nrow(data)

  # Main information
  keyword_TF <- sum(data$Freq)                # total occurrences (tokens)
  keyword <- n_keywords                       # distinct keywords (types)
  keyword_AF <- keyword_TF / keyword          # average frequency
  keyword_SD <- sd(data$Freq)                 # standard deviation
  keyword_CV <- keyword_SD / keyword_AF * 100 # coefficient of variation (%)
  keyword_TTR <- keyword / keyword_TF * 100   # type-token ratio (%)
  keyword_HP <- sum(data$Freq == 1)           # hapax keywords
  keywordMainInformation <- data.frame(
    keyword = keyword, keyword_TF = keyword_TF, keyword_AF = keyword_AF,
    keyword_SD = keyword_SD, keyword_CV = keyword_CV,
    keyword_TTR = keyword_TTR, keyword_HP = keyword_HP
  )

  # Distribution: number of keywords observed at each frequency
  TabFreq <- as.data.frame(table(data$Freq))
  names(TabFreq) <- c('keywordFreq', 'Count')

  # Pattern: rank share against cumulative frequency share.
  # seq_len() and the explicit totals keep this well-defined for an empty table.
  data$keywordRank <- seq_len(n_keywords)
  data$keywordRankPercentage <- data$keywordRank / n_keywords * 100
  data$FreqCumsum <- cumsum(data$Freq)
  data$FreqCumsumPercentage <- data$FreqCumsum / keyword_TF * 100

  # Results
  list(MainInformation = keywordMainInformation, Distribution = TabFreq,
       Pattern = data)
}
二、加载包,数据导出与导入
pacman::p_load(bibliometrix, rio, tidyverse)
# Read the bibliographic records from the Excel workbook.
m1_TE <- import(file = "E:/精鼎统计/m1_TE.xlsx")
# Use the values of the SR column as the row names of the data frame.
row.names(m1_TE) <- m1_TE$SR
三、关键词描述统计
# Run the keyword analysis on the imported records; the result is a list with
# the elements MainInformation, Distribution and Pattern.
keywordInfo <- keywordAnalysis(M = m1_TE)

# One-row summary of the keyword statistics. Sample output:
keywordInfo[["MainInformation"]]
# keyword keyword_TF keyword_AF keyword_SD keyword_CV keyword_TTR keyword_HP
# 1 957 5300 5.53814 15.14573 36.56568 18.0566 496

# First rows of the frequency distribution (keywords per frequency value).
# Sample output:
head(keywordInfo[["Distribution"]])
# keywordFreq Count
# 1 1 496
# 2 2 155
# 3 3 81
# 4 4 33
# 5 5 27
# 6 6 21

# First rows of the rank / cumulative-frequency table. Sample output:
head(keywordInfo[["Pattern"]])
# Tab Freq keywordRank keywordRankPercentage FreqCumsum FreqCumsumPercentage
# 1 CATCHMENT 205 1 0.1044932 205 3.867925
# 2 STABLE-ISOTOPE 150 2 0.2089864 355 6.698113
# 3 RUNOFF 149 3 0.3134796 504 9.509434
# 4 GROUNDWATER 123 4 0.4179728 627 11.830189
# 5 PRECIPITATION 106 5 0.5224660 733 13.830189
# 6 RUNOFF-GENERATION 103 6 0.6269592 836 15.773585
四、词频分布与模式可视化
词频分布:
# Bar chart of the keyword frequency distribution: one bar per observed
# frequency value, bar height = number of keywords with that frequency.
fig1 <- ggplot(
  data = keywordInfo$Distribution,
  mapping = aes(x = keywordFreq, y = Count)
) +
  geom_bar(stat = "identity") +
  # theme_bw() is a complete theme, so the axis-title tweak must follow it.
  theme_bw() +
  theme(axis.title = element_text(size = 20)) +
  labs(x = "关键词频率", y = "关键词数量")
fig1
关键词分布模式:
# Cumulative keyword pattern: share of keywords (by rank) on the x axis against
# the share of all keyword occurrences they account for on the y axis.
# The Top-50/100/200 markers are read from keywordInfo$Pattern instead of being
# typed in by hand, so positions and label text always match the data being
# plotted; a marker is skipped when the data has fewer keywords than its rank.
fig2 <- local({
  pattern <- keywordInfo$Pattern
  top_ranks <- c(50L, 100L, 200L)
  top_colors <- c("red", "green", "blue")

  p <- ggplot(pattern, aes(x = keywordRankPercentage, y = FreqCumsumPercentage)) +
    geom_line()

  for (i in seq_along(top_ranks)) {
    row <- match(top_ranks[i], pattern$keywordRank)
    if (is.na(row)) {
      next
    }
    x_pct <- pattern$keywordRankPercentage[row]
    y_pct <- pattern$FreqCumsumPercentage[row]
    p <- p + annotate(
      geom = "text", x = x_pct, y = y_pct,
      label = sprintf("(Top %d, %.2f%%, %.2f%%)", top_ranks[i], x_pct, y_pct),
      color = top_colors[i]
    )
  }

  p +
    labs(x = '关键词累积百分比', y = '关键词词频累积百分比') +
    theme_bw()
})
fig2
赞 (0)