---
title: "Analyse Endnote Library"
author: "Michael Rustler"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Analyse Endnote Library}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

# Import Endnote XML As Data Frame

```{r echo=TRUE}
library(kwb.endnote)

endnote_xml <- kwb.endnote::default_xml()

# Import the XML and drop all rows of reference type "Generic"
references_df <- kwb.endnote::create_df_from_endnote_xml(endnote_xml) %>%
  dplyr::filter(.data$ref_type_name != "Generic")

# Number of distinct record ids that remain after filtering
n_publications <- length(unique(references_df$record_id))
```

# Analysis of Publications

The imported Endnote XML library `r basename(endnote_xml)` contains
`r n_publications` publications which are analysed in detail in the following.

## By Type

```{r echo=TRUE}
# Number of references per reference type, most frequent type first
refs_by_type <- kwb.endnote::get_reference_type_names(endnote_xml) %>%
  dplyr::count(.data$ref_type_name) %>%
  dplyr::arrange(dplyr::desc(.data$n))

knitr::kable(refs_by_type)
```

## By Year

```{r echo=TRUE, fig.width=8, fig.height=9}
endnote_list <- kwb.endnote::create_endnote_list(endnote_xml)
refs_df <- kwb.endnote::create_references_df(endnote_list)

p1 <- kwb.endnote::plot_pubs_by_year(refs_df)
plotly::ggplotly(p1)
```

## By Language

```{r echo=TRUE}
# Count per language and share (in percent) of all rows of refs_df
refs_by_language <- refs_df %>%
  dplyr::count(.data$language) %>%
  dplyr::mutate(percent = round(100 * .data$n / nrow(refs_df), 1))

knitr::kable(refs_by_language)
```

## Has an Abstract?

```{r echo=TRUE}
# One row per record that has at least one "abstract" entry; column n holds
# the number of "abstract" rows found for that record
refs_with_abstract <- references_df %>%
  dplyr::filter(.data$key1 == "abstract") %>%
  dplyr::count(.data$record_id, .data$rec_number, .data$ref_type_name)

n_pubs_with_abstract <- nrow(refs_with_abstract)

# Percent of publications with abstracts
percent_pubs_with_abstracts <- round(
  100 * n_pubs_with_abstract / n_publications, 1
)
```

`r percent_pubs_with_abstracts` percent (i.e. `r sprintf("%d / %d", n_pubs_with_abstract, n_publications)` publications) have an abstract.

```{r echo=TRUE}
# Records for which more than one "abstract" row was counted
refs_with_abstract_and_linebreaks <- refs_with_abstract %>%
  dplyr::filter(.data$n > 1) %>%
  dplyr::arrange(dplyr::desc(.data$n))

n_pubs_with_abstract_and_linebreaks <- nrow(refs_with_abstract_and_linebreaks)

knitr::kable(refs_with_abstract_and_linebreaks)
```

However, the abstracts of in total `r n_pubs_with_abstract_and_linebreaks`
publications contain line breaks, as shown in the table above. These need to be
corrected manually!

## By Accessibility Level

Is the accessibility level for the reference defined in the field "caption"
(confidential or not)?

```{r echo=TRUE}
refs_with_accessability_level <- refs_df %>%
  dplyr::count(.data$caption) %>%
  dplyr::mutate(percent = round(100 * .data$n / nrow(refs_df), 1))

# Rows of the count table whose caption is not NA. Selecting them by value
# instead of by position ([1:2]) stays correct for any number of distinct
# caption values.
has_accessability_level <- !is.na(refs_with_accessability_level$caption)

n_pubs_with_accessability_level <- sum(
  refs_with_accessability_level$n[has_accessability_level]
)

percent_with_accessibility_level <- round(
  100 * n_pubs_with_accessability_level / n_publications, 1
)

is_confidential <- has_accessability_level &
  refs_with_accessability_level$caption == "confidential"

# sum() yields 0 (instead of a zero-length vector) if no row is "confidential"
n_confidential <- sum(refs_with_accessability_level$n[is_confidential])

# Share relative to the publications that have a caption, as stated in the
# text below (the "percent" column is relative to all rows of refs_df)
percent_confidential <- round(
  100 * n_confidential / n_pubs_with_accessability_level, 1
)

knitr::kable(refs_with_accessability_level)
```

Only for `r n_pubs_with_accessability_level` publications (i.e.
`r percent_with_accessibility_level` percent) meta information on the
accessibility level is explicitly defined, out of which `r n_confidential` are
defined as confidential (i.e. `r percent_confidential` percent of the
publications with metadata on the accessibility level).

**Recommendation:** Fill the field **caption** for all publications with either:

- **public** or

- **confidential**

## By Project

Group references by "label" (i.e. "project names")

```{r echo=TRUE}
# References for which the field "label" is not NA
refs_with_project <- refs_df[!is.na(refs_df$label), ]
n_pubs_with_project_meta <- nrow(refs_with_project)

# in percent
percent_pubs_with_project_meta <- round(
  100 * n_pubs_with_project_meta / n_publications, 1
)

unique_project_names <- refs_with_project %>%
  dplyr::count(.data$label)

knitr::kable(unique_project_names)
```

In total `r percent_pubs_with_project_meta` percent (i.e. `r sprintf("%d / %d", n_pubs_with_project_meta, n_publications)` publications) contain meta-information on the project.
However, the spelling of project names does not follow a controlled vocabulary
yet as can be seen in the table above. Thus it is recommended to establish a
controlled vocabulary for [project identifiers](https://kwb-r.github.io/fakin.doc/qms.html#project-identifier)
according to the best-practices defined in the FAKIN project (see [here](https://kwb-r.github.io/fakin.doc/best-practices.html#acronyms)).

## By Author

## Lastname/Firstname

```{r echo=TRUE, fig.height=10, fig.width=8}
# Number of "authors" rows per distinct author string, most frequent first
refs_by_author_lastfirst <- references_df %>%
  dplyr::filter(.data$key2 == "authors") %>%
  dplyr::count(.data$value) %>%
  dplyr::arrange(dplyr::desc(.data$n))

# head() returns at most 50 rows; indexing with [1:50, ] would append
# all-NA rows whenever fewer than 50 authors are available
p2 <- kwb.endnote::plot_pubs_by_author(
  utils::head(refs_by_author_lastfirst, 50)
)
plotly::ggplotly(p2)
```

The figure above is quite messy. Thus names should be entered in Endnote in the
following format: **Lastname, Firstname**

Also note that **,** is also listed as an author! This is due to entering a
wrong semicolon in the Endnote database and needs to be fixed here for at least
the following entries:

```{r echo=TRUE}
# Records that contain an author entry consisting only of ","
pubs_with_semicolon_as_author <- references_df %>%
  dplyr::filter(.data$key2 == "authors", .data$value == ",") %>%
  dplyr::count(.data$rec_number) %>%
  dplyr::arrange(dplyr::desc(.data$n))

n_pubs_with_semicolon_as_author <- nrow(pubs_with_semicolon_as_author)

knitr::kable(pubs_with_semicolon_as_author)
```

In total `r n_pubs_with_semicolon_as_author` publications contain one or more **semicolon** authors.
This needs to be fixed manually in the Endnote DB.

## Lastname

For now data-munging is needed to just use the authors' last names for
aggregating the data

```{r echo=TRUE, fig.height=10, fig.width=8}
# Reduce each author string before counting:
# 1. remove everything from the first comma onwards
# 2. remove a leading word (optionally followed by a dot) plus whitespace
refs_by_author_last <- references_df %>%
  dplyr::filter(.data$key2 == "authors") %>%
  dplyr::mutate(value = stringr::str_remove_all(.data$value, ",.*")) %>%
  dplyr::mutate(value = stringr::str_remove_all(.data$value, "^\\w+\\.?\\s+")) %>%
  dplyr::count(.data$value) %>%
  dplyr::arrange(dplyr::desc(.data$n))

# head() returns at most 50 rows; indexing with [1:50, ] would append
# all-NA rows whenever fewer than 50 authors are available
p3 <- kwb.endnote::plot_pubs_by_author(utils::head(refs_by_author_last, 50))
plotly::ggplotly(p3)
```

## Journal Articles

### By Journal

Journal names need to be harmonised. It also needs to be checked what the
differences between the **full-title** and the **secondary-title** of a journal
are.

#### Journal full-title

Defined in: *$`record`$periodical$`full-title`$`style`[[1]]*

```{r echo=TRUE}
pubs_in_journals <- references_df %>%
  dplyr::filter(.data$ref_type_name == "Journal Article")

# Number of distinct record ids of type "Journal Article"
n_pubs_in_journals <- length(unique(pubs_in_journals$record_id))

# Distinct "full-title" values with the field (key1_key2) they were read from
journal_title_full <- pubs_in_journals %>%
  dplyr::filter(.data$key2 %in% c("full-title")) %>%
  dplyr::mutate(source_field = sprintf("%s_%s", .data$key1, .data$key2)) %>%
  dplyr::count(.data$value, .data$source_field)

knitr::kable(journal_title_full)
```

#### Journal secondary-title

Defined in: *$`record`$titles$`secondary-title`$`style`[[1]]*

```{r echo=TRUE}
# Distinct "secondary-title" values with the field (key1_key2) they were read
# from
journal_title_secondary <- pubs_in_journals %>%
  dplyr::filter(.data$key2 %in% c("secondary-title")) %>%
  dplyr::mutate(source_field = sprintf("%s_%s", .data$key1, .data$key2)) %>%
  dplyr::count(.data$value, .data$source_field)

knitr::kable(journal_title_secondary)
```

#### Journal DOI

All `r n_pubs_in_journals` journal articles should have a DOI.
Let's check for how many we have this metadata:

```{r echo=TRUE}
# Journal article rows with a non-empty "electronic-resource-num" value
has_doi <- pubs_in_journals %>%
  dplyr::filter(.data$key1 == "electronic-resource-num", .data$value != "")

n_pubs_in_journals_with_doi <- nrow(has_doi)

knitr::kable(has_doi)
```

Only `r n_pubs_in_journals_with_doi` (`r round(100*n_pubs_in_journals_with_doi/n_pubs_in_journals,1)` %) journal publications have a DOI which is encoded in the field **electronic-resource-num**.

All DOIs should be entered in the field **electronic-resource-num** in the
following format, i.e.: **10.6084/m9.figshare.828487**

# Export References To XLSX

Write references dataframe to xlsx with one sheet for each **publication type**
to `references.xlsx`

```{r echo=TRUE, eval=FALSE}
kwb.endnote::write_references_df_to_xlsx(endnote_list)
```

And also writing a **cleaned** version after automatically running
`kwb.endnote::clean_references_df()` to `references_cleaned.xlsx`.

```{r echo=TRUE, eval=FALSE}
kwb.endnote::write_clean_references_df_to_xlsx(endnote_list)
```

# Check Endnote References

```{r echo=TRUE, eval=TRUE}
problematic_entries <- kwb.endnote::check_problematic_entries(endnote_list)

# Show at most the first 100 entries; indexing with [1:100, ] would append
# all-NA rows whenever fewer than 100 entries are returned
knitr::kable(utils::head(problematic_entries, 100))
```