文献计量学系列32: 关键词描述统计、词频分布和模式
一、自定义函数
#' Descriptive statistics, frequency distribution and rank pattern of keywords
#'
#' Tabulates one tag field of a bibliographic data frame with `tableTag()`
#' (in scope once bibliometrix is loaded) and derives summary indicators from
#' the resulting keyword/frequency table.
#'
#' @param M Bibliographic data frame, passed unchanged to `tableTag()`.
#' @param Tag Name of the field to tabulate. Defaults to `'DE_TM'`.
#' @param sep Separator between entries inside the field. Defaults to `';'`.
#' @return A list with three data frames:
#'   * `MainInformation`: one row with the number of distinct keywords
#'     (`keyword`), total frequency (`keyword_TF`), average frequency
#'     (`keyword_AF`), standard deviation of the frequencies (`keyword_SD`),
#'     `keyword_CV`, type-token ratio in percent (`keyword_TTR`) and the
#'     number of keywords occurring exactly once (`keyword_HP`).
#'   * `Distribution`: how many keywords (`Count`) share each frequency
#'     value (`keywordFreq`, a factor).
#'   * `Pattern`: the keyword table extended with rank, rank percentage,
#'     cumulative frequency and cumulative frequency percentage.
keywordAnalysis <- function(M, Tag = 'DE_TM', sep = ';') {
  data <- as.data.frame(
    tableTag(M, Tag = Tag, sep = sep),
    stringsAsFactors = FALSE
  )

  # Main information ----
  keyword_TF <- sum(data$Freq)          # total frequency (tokens)
  keyword <- nrow(data)                 # number of distinct keywords (types)
  keyword_AF <- keyword_TF / keyword    # average frequency per keyword
  keyword_SD <- sd(data$Freq)           # standard deviation of frequencies
  # NOTE(review): the conventional coefficient of variation is sd / mean.
  # The mean / sd form is kept so results stay identical to the values
  # published with this function -- confirm which one is intended.
  keyword_CV <- keyword_AF / keyword_SD * 100
  keyword_TTR <- keyword / keyword_TF * 100   # type-token ratio, in percent
  keyword_HP <- sum(data$Freq == 1)           # number of hapax keywords

  keywordMainInformation <- data.frame(
    keyword = keyword,
    keyword_TF = keyword_TF,
    keyword_AF = keyword_AF,
    keyword_SD = keyword_SD,
    keyword_CV = keyword_CV,
    keyword_TTR = keyword_TTR,
    keyword_HP = keyword_HP
  )

  # Distribution ----
  # Number of keywords observed at each frequency value.
  TabFreq <- as.data.frame(table(data$Freq))
  names(TabFreq) <- c('keywordFreq', 'Count')

  # Pattern ----
  # Ranks follow the row order of the table, so this assumes tableTag()
  # returns keywords sorted by decreasing frequency.
  # seq_len() keeps the assignment valid when the table has no rows.
  data$keywordRank <- seq_len(keyword)
  data$keywordRankPercentage <- data$keywordRank / keyword * 100
  data$FreqCumsum <- cumsum(data$Freq)
  data$FreqCumsumPercentage <- data$FreqCumsum / keyword_TF * 100

  # Results ----
  list(
    MainInformation = keywordMainInformation,
    Distribution = TabFreq,
    Pattern = data
  )
}

# 二、加载包,数据导出与导入 (2. Load packages; export and import data) ----
# Load (installing if necessary) the packages used by the analysis.
pacman::p_load(bibliometrix, rio, tidyverse)

# Read the bibliographic data from the exported Excel workbook and use the
# short reference column (SR) as row names.
m1_TE <- import(file = 'E:/精鼎统计/m1_TE.xlsx')
rownames(m1_TE) <- m1_TE$SR

# 三、关键词描述统计 (3. Descriptive statistics of keywords) ----
# Run the keyword analysis on the imported data with the default tag/separator.
keywordInfo <- keywordAnalysis(m1_TE)

# One-row summary of the keyword indicators.
keywordInfo$MainInformation
#   keyword keyword_TF keyword_AF keyword_SD keyword_CV keyword_TTR keyword_HP
# 1     957       5300    5.53814   15.14573   36.56568     18.0566        496

# First rows of the frequency distribution (keywords per frequency value).
head(keywordInfo$Distribution)
#   keywordFreq Count
# 1           1   496
# 2           2   155
# 3           3    81
# 4           4    33
# 5           5    27
# 6           6    21
# First rows of the rank / cumulative-frequency table.
head(keywordInfo$Pattern)
#                 Tab Freq keywordRank keywordRankPercentage FreqCumsum FreqCumsumPercentage
# 1         CATCHMENT  205           1             0.1044932        205             3.867925
# 2    STABLE-ISOTOPE  150           2             0.2089864        355             6.698113
# 3            RUNOFF  149           3             0.3134796        504             9.509434
# 4       GROUNDWATER  123           4             0.4179728        627            11.830189
# 5     PRECIPITATION  106           5             0.5224660        733            13.830189
# 6 RUNOFF-GENERATION  103           6             0.6269592        836            15.773585

# 四、词频分布与模式可视化 (4. Visualising the frequency distribution and pattern) ----
词频分布:
# Bar chart of the frequency distribution: keyword frequency on the x axis,
# number of keywords having that frequency on the y axis.
fig1 <- ggplot(keywordInfo$Distribution, aes(x = keywordFreq, y = Count)) +
  geom_bar(stat = 'identity') +
  labs(x = '关键词频率', y = '关键词数量') +
  theme_bw() +
  theme(axis.title = element_text(size = 20))

# Print the plot.
fig1
关键词分布模式
# Cumulative curve: share of keywords (by rank) on the x axis against the
# share of total keyword frequency they account for on the y axis.
# The three annotations mark the top 50 / 100 / 200 keywords; their
# coordinates are hard-coded for this particular dataset and must be
# recomputed from keywordInfo$Pattern if the data change.
fig2 <- ggplot(
  keywordInfo$Pattern,
  aes(x = keywordRankPercentage, y = FreqCumsumPercentage)
) +
  geom_line() +
  annotate(
    geom = "text", x = 5.2246604, y = 51.981132,
    label = "(Top 50, 5.22%, 51.98%)", color = 'red'
  ) +
  annotate(
    geom = "text", x = 10.4493208, y = 65.716981,
    label = "(Top 100, 10.45%, 65.72%)", color = 'green'
  ) +
  annotate(
    geom = "text", x = 20.89864, y = 78.32075,
    label = "(Top 200, 20.90%, 78.32%)", color = 'blue'
  ) +
  labs(x = '关键词累积百分比', y = '关键词词频累积百分比') +
  theme_bw()

# Print the plot.
fig2
赞 (0)
