Ce document vise à rassembler les commandes utiles pour programmer en R.


Départ d’analyse

Import données

  • JSON
library(jsonlite)
decp_Lambersart <- fromJSON(txt = "../data/decp/decp_acheteur.json", flatten = T)
decp_Lambersart <- as.data.frame(decp_Lambersart$marches) |> 
  mutate(titulaires = map(titulaires, ~ mutate(.x, id = as.character(id)))) |> 
  unnest(cols = c(titulaires))
  • XML
library(XML)
library(httr)
data_19 <- xmlParse(content(GET("https://marchespublics596280.fr/app.php/api/v1/donnees-essentielles/contrat/xml-extraire-criteres/50286/a:1:%7Bi:0;i:0;%7D/1/2019/false/false/false/false/false/false/false/false/false", user_agent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36")), "text"), 
                      encoding = "UTF-8")
xml_df_19 <- xmlToDataFrame(nodes = getNodeSet(data_19, "//marche")) |> mutate(Year = 2019)
  • SAS
library(haven)
data <- read_sas("../data/mon_fichier.sas7bdat")
  • Multiple CSV into folder (import and rbind)
library(data.table)
rbindlist_fread <- function(path, pattern = "*.csv") {
    files = list.files(path, pattern, full.names = TRUE)
    data.table::rbindlist(lapply(files, function(x) fread(x)))
}
data <- rbindlist_fread("mon/super/path")
  • ZIP
library(utils)
download.file("lien/vers/zip.zip", "dossier_complet.zip")
unzip("dossier_complet.zip")
data <- read_delim("dossier_complet.csv", ";", trim_ws = TRUE)
  • Google sheets
library(googlesheets4) 
data <- read_sheet("lien/vers/le/google/sheets")
# Attention : ne marche que si le tableau est en format GoogleSheet et pas Excel déposé sur Drive !!


API

  • Open Alex
parse_api_open_alex <- function(start, end){
    
    # Import des données : Works dataset, appels de l'API
    works_data <- purrr::map(
        .x = dois_bso[start:end,]$doi,
        .y = data.frame(matrix(ncol = 1, nrow = 1)),
        possibly(.f = ~fromJSON(txt = paste("https://api.openalex.org/works/mailto:diane@datactivist.coop/doi:", .x, sep = ""), flatten = T), otherwise = NA_character_),
        .default = NA)
    
    # Aplatissement
        # sélection des 2 variables qui nous intéressent
    works_df <- purrr::map(
        .x = works_data,
        .y = data.frame(matrix(ncol = 1, nrow = 1)),
        possibly(.f = ~unnest(data.frame(    # on récupère chaque élément/variable qui nous intéresse, on les met dans un df
                       doi = .x$doi, 
                       .x$authorships),
                   cols = "institutions", names_repair = "universal") |> select(doi, country_code), otherwise = NA_character_), 
        .default = NA)
        # suppression des NA et mise au format tabulaire
    works_df <- works_df[works_df !=  "NA"] # replace NA (DOIs non matchés avec OpenAlex) by NULL
    works_df <- rrapply(works_df, condition = Negate(is.null), how = "prune") #remove NULL
    works_df <- works_df |> bind_rows()
    
    # Export du df
    rio::export(works_df, glue("data/3.external/OpenAlex/french_CA/API_{start}_{end}.csv"))

}


### On applique la fonction pour 50 DOIs
parse_api_open_alex(1,50)


Web scraping

  • Easy scraping (data into table in 1 page)
library(rvest)
content <- read_html("url")
body_table <- content |> html_nodes('body')  |>
                    html_nodes('table') |>
                    html_table(dec = ",") 
data <- body_table[[1]]
  • Middle scraping (data into table in multiple pages)
library(rvest)
library(tidyverse)
data <- purrr::map(
        .x = (as.data.frame(rep(1:5, each = 1)) |> rename(page = `rep(1:5, each = 1)`))$page,
        .y = data.frame(matrix(ncol = 1, nrow = 1)),
        .f = ~read_html(paste0("http://portal.core.edu.au/conf-ranks/?search=&by=all&source=all&sort=atitle&page=", .x)) |> html_nodes('body')  |> html_nodes('table') |> html_table(dec = ","), 
        .default = NA)
data <- bind_rows(data)
  • Difficult scraping (data in the body text in multiple pages)
# Code valable en janvier 2023, site a évolué maintenant
library(htm2txt)
core_millesime <- purrr::map(
        .x = (as.data.frame(rep(1:10, each = 1)) |> rename(page = `rep(1:10, each = 1)`))$page,
        .y = data.frame(matrix(ncol = 1, nrow = 1)),
        possibly(.f = ~ as.data.frame(gettxt(paste0('http://portal.core.edu.au/conf-ranks/', .x, '/'))) |> #import page par page
    rename(text = 1) |> 
    mutate(text = strsplit(as.character(text), "\n")) |> unnest(text) |> #split les éléments séparés par des "\n"
    filter(row_number() ==  10 | #nom de conférence
               grepl("Acronym:", text) ==  TRUE | 
               grepl("Source:", text) ==  TRUE | 
               grepl("Rank:", text) ==  TRUE, #champs que l'on garde
           grepl("DBLP", text) ==  FALSE) |> #retrait de la ligne contenant ce string
    mutate(text = case_when(row_number() ==  1 ~ paste("Title:", text), TRUE ~ text), #ajout du préfixe "titre:"
           champ = str_extract(text, "^[a-zA-Z0-9_]*"), #dans une nouvelle colonne ce qui est avant ":"
           value = str_extract(text, "(?< = : )[^\n]*")) |> #dans une nouvelle colonne ce qui est après ": "
    select(-text) |> t() |> row_to_names(row_number = 1) |> data.frame() |> #transpose puis 1ère ligne en nom de colonnes
    pivot_longer(cols = -c(Title, Acronym), names_to = "number", values_to = "value", names_prefix = "Source|Rank") |> # format long pour rank et source quand multiples
    mutate(col = case_when(row_number() %% 2 ==  0 ~ "rank",
                           row_number() %% 2 ==  1 ~ "source")) |> #
    pivot_wider(names_from = col, values_from = value) |> select(-number) |> mutate(core_id = .x), otherwise = NA_character_),
        .default = NA)

# Gestion des Na et mise au format tabulaire
core_histo <- core_millesime[core_millesime !=  "NA"] # replace NA by NULL
core_histo <- rrapply::rrapply(core_histo, condition = Negate(is.null), how = "prune") #remove NULL
core_histo <- core_histo |> bind_rows()


Packages

  • Install and/or load multiple packages
packages = c("tidyverse", "jsonlite", "glue", "parallel", "doParallel", "foreach")
package.check <- lapply(
  packages,
  FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  }
)


Traitement de variables


Nettoyage de données


Noms de colonnes

  • Clean column names
library(janitor)
data <- data |> clean_names() # retire majuscules, espaces et caractères spéciaux
  • Clean specific column names
library(janitor)
clean_some_names <- function(dat, idx, ...) {
  names(dat)[idx] <- janitor::make_clean_names(names(dat)[idx], ...)
  dat
}
data <- data |> 
    clean_some_names(14:18)
  • Rename all columns with suffix except two
data <- data |> rename_at(vars(-Name, -State), ~ paste0(., '_2017'))
  • Order alphabetically columns
data <- data |> select(indicateurs, order(colnames(data)))
  • Order categorical column based on other categorical col
# Dans le cas où les niveaux changent, ex : "3 à 5 fois par semaine (12 commerces)"
table |> 
  mutate(discu_nb_commerces = factor(discu_nb_commerces, 
                                     levels = table |> 
                                       distinct(`Avez-vous l'habitude de discuter avec vos clients en dehors du cadre strict de la vente ? (exemple : prendre des nouvelles personnelles)`, discu_nb_commerces) %>%
                                       arrange(factor(`Avez-vous l'habitude de discuter avec vos clients en dehors du cadre strict de la vente ? (exemple : prendre des nouvelles personnelles)`, 
                                                      levels = c("Jamais", "Très rarement (moins d'une fois par mois)", "Rarement (1 à 2 fois par mois)", "Ponctuellement (1 à 2 fois par semaine)", "Régulièrement (3 à 5 fois par semaine)", "Très régulièrement (plusieurs fois par jour)"))) |> 
                                       pull(discu_nb_commerces)))


NAs

  • Fill NAs with previous values by group
data <- data |> 
    group_by(`Référence commande`) |> 
    fill(everything(), .direction = "downup") |> 
    ungroup()
  • Replace NA
# NAs d'une colonne par 0
data <- data |> mutate(col = replace_na(col, 0))

# NAs du df entier par 0
data <- data |> mutate_all(replace_na, 0)

# NAs par string
data <- data |> replace(is.na(.), "unknown")

# NAs par valeurs autre colonne
data <- data |> mutate(col_NA = coalesce(col_NA, col_replace))
  • Replace by NA
# Cellules vides par NAs
data <- data |> mutate_all(na_if, "")

# NULL par NAs
data <- data |> replace(. == "NULL", NA) 
data[data ==  "null"] <- NA

# Chiffres négatifs par NAs
data <- data |> mutate(col = replace(col, which(col<0), NA))

# Valeur par NAs sur certaines colonnes
data <- data |> mutate(across(starts_with("Choix_"), ~ na_if(.x, "Pas de préférence")))


Variables numériques

  • Round multiple columns
data <- data |> mutate_at(vars(var_3:var_17), ~round(.,0))
  • Round values to nearest 5
plyr::round_any(x, 5)
  • Shaping numeric variables
# Nombres arrondis au million
format(round(100000000 / 1e6, 1), trim = TRUE)

# Centaines et milliers séparés des virgules [comma]
format(as.integer(1000000, 0), nsmall = 1, big.mark = ".")
  • “Total” row per group
library(janitor)
data <- data |> 
    group_by(Var2) |> 
    group_modify(~ adorn_totals(.x, where = "row")) |> 
    ungroup() |> 
    arrange(fct_relevel(Commune, 'Total')) #puis trier avec ligne "Total" en haut puis ordre alphabétique sur colonne "Commune"

# Solving error "trying to re-add a totals dimension that is already been added"
data |> 
    untabyl() |> #ajouter avant le total
    adorn_totals() 


Indices, rep(), seq()

  • Index rep
c(rep(1:5570, each = 50), rep(5571, each = 7))
  • Between groups index (not within)
data <- data |> 
    group_by(defi_profil) |> 
    mutate(groupNbr = cur_group_id())


Dates

  • Change date format
data <- mutate(date = format(as.Date(Date, format = "%Y-%m-%d %H:%M:%S"),"%d %B %Y")) #30 mai 2023
# autres formats : https://www.r-bloggers.com/2013/08/date-formats-in-r/
  • Dates subtraction
data <- data |> mutate(nb_weeks = round(as.numeric(difftime(fin, Sys.Date(), units = "weeks")), 0), #semaine
                       nb_month = round(as.numeric(difftime(fin, Sys.Date(), units = "weeks") /4.34524), 0)) #mois
  • Dates sum
library(mondate)
data <- data |> mutate(date_fin = as.mondate(date_debut) + duree) #duree en mois


Âge

  • Check intersections between intervals
data <- data |> 
    mutate(has_intersection = all(sapply(1:(n() - 1), function(i) {
      all(pmax(min[i], min[(i + 1):n()]) <= pmin(max[i]+3, max[(i + 1):n()]+3)) 
    })), .by = author_id) #où min est âge_min et max est age_max de la tranche
  • Create age intervals
data <- data |> mutate(tranche_age = cut(age, c(18,20, seq(30, 90, 5), 98)))
  • Find age from date of birth
data <- data |> mutate(age = round(as.numeric(difftime(Sys.Date(), dateOfBirth, units = "weeks")) / 52.1429, 0)) #année


Chaînes de caractères

  • Replace string
# Valeurs d'une colonne
data <- data |> mutate(col = str_replace_all(col, c("pattern1 | pattern2" = "replacement")))

# Attention à escape les parenthèses pour que le remplacement fonctionne
data <- data |> mutate(col = str_replace_all(col, c("string avec \\(parenthèses\\)" = "replacement")))

# Valeurs du df entier
data <- data |> mutate_all(function(x) gsub("pattern1 | pattern2", "replacement", x))
  • Replace character of multiple columns
data |> mutate_at(vars(January:December), ~str_replace(., ",", "."))
  • Upper string and remove special characters
data <- data |> mutate(col = stringi::stri_trans_general(str = gsub("-", " ", toupper(string)), id = "Latin-ASCII"))
  • Upper first letter of both words
library(tools)
toTitleCase(tolower("MY STRING"))
toTitleCase("my other string")
  • Upper first letter of first word
gsub("^(\\w)(\\w+)", "\\U\\1\\L\\2", "my other string", perl = TRUE)
  • Remove specific words/characters
data <- data |> mutate(col = removeWords(string, c("IEEE ", "ACM ", "SIAM ")))
  • Remove duplicated words
rem_dup_word <- function(x){
  #x <- tolower(x)
  paste(unique(trimws(unlist(strsplit(x, split = " ", fixed = F, perl = T)))), collapse = " ")
}
rem_dup_word(x)
  • Remove isolated letters/characters
data <- data |> mutate(col = gsub("\\W*\\b\\w\\b\\W*", " ", string))
  • Remove blank spaces at the begining (or the end)
data <- data |> mutate(col = trimws(string, which = "left"))
  • Remove blank spaces at the begining and at the end
data <- data |> mutate(col = str_squish(col)) #specific column
data <- data |> mutate_all(~str_squish(.)) #all character columns
  • Remove first digit if 0
data <- data |> mutate(num = gsub("^0", "", num))
  • Remove special characters (all)
# Supprimer les caractères spéciaux ex : ? ' !
data <- data |> mutate(col = str_replace_all(col, "[^[:alnum:]]", " "))
  • Keep first word
data <- data |> mutate(first_word = word(string, 1))
  • Extract digits
library(strex)
data <- data |> mutate(min = str_nth_number(string, n = 1)) # extrait le 1er chiffre du string
  • Extract year
data <- data |> mutate(annee = str_extract(`En quelle année ?`, "(1|2)\\d{3}")) #seulement "\\d{5}" pour zipCode
  • Extract n first characters
data <- data |> mutate(sub_string = substr(string, 1, n))
  • Extract n last characters
data <- data |> mutate(sub_string = substr(string, nchar(string)-n+1, nchar(string)))
  • Add digit identifiant
data <- data |> mutate(num = sprintf("%02d", num)) #passer de 1 à 2 digits
data <- data |> mutate(num = str_pad(num, 14, pad = "0")) #obliger à avoir 14 caractères (donc ajoute 0 en début si besoin)
  • Only 1 character in a string
nchar(string) ==  1


Regular expressions (regex)

  • Detect digits
str_detect(string, "[0-9]") ==  TRUE
grep("\\d+", string, value = TRUE) 
  • Detect special characters (i.e. no letter nor digit)
grepl('[^[:alnum:]]', string)
  • Detect dates with format “%Y-%m-%d %H:%M:%S”
grepl("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", string)
  • Extract characters before first digit
str_extract(string, "^\\D+")
  • Extract characters before “:”
str_extract(string, "^[a-zA-Z0-9_]*")
  • Extract characters after “:”
str_extract(string, "(?<=: )[^\n]*")
  • Extract characters after “[”
str_extract(string, "(?<=\\[).*")
substr(Question, 1, nchar(Question)-1) #remove last character
  • Extract characters before “[”
str_extract("string bla [da]", "^.*\\[") #crochet inclu dans l'extraction
  • Extract characters before “(”
str_extract(string, "^[^(]+")
  • Extract characters between parenthesis
str_extract(string, "\\(.*?\\)")
  • Extract characters after blankspace
str_extract(string, "\\s+(.*)")
  • Extract all numbers and “+” character, and collapse into 1 cell
sapply(str_extract_all("+3214FSEtest!! 1", "[\\d+]+"), function(x) paste(x,collapse=""))
  • Replace misled zip code
# Reformater code postal mal renseigné (avec des espaces entre les 5 digits)
# Exemple : "49, avenue Du grau du rieu Marseillan Occitanie 34 340 France"
mutate(adresses = str_replace_all(adresses, "\\b(\\d{2}) (\\d{3})\\b", "\\1\\2"))


Listes

  • Unlist
data <- data |>
    pull(column) |> pluck() |> bind_rows() |> 
    group_by(author_id) |> mutate(n = n()) |> select(author_id, n) |> distinct()
  • Complex unlist when simple pull/pluck is not working
# Exemple
comm1 <- data |> 
    filter(lengths(comments) != 0) |> 
    group_by(id) |> 
    mutate(nb_comments = nrow(comments[[1]])) |> 
    select(id2, id, comments, nb_comments) |> 
    pluck() |> bind_rows()
comm2 <- comm1 |>
    pull(comments) |> 
    pluck() |> bind_rows()
comm1.1 <- comm1 |> ungroup() |> 
    mutate(index = row_number()) |> 
    group_by(id) |> 
    slice(rep(1:n(), each = nb_comments)) |> 
    arrange(index)
proj_comm_date <- cbind(comm1.1, comm2) |> ungroup() |> 
    select(id2, author_id, publishedAt) |> 
    rename(date = publishedAt) |> mutate(type = "commentaires")


Filtres

  • Remove values containing string
data <- data |> filter(!grepl(',', column)) #containing comma
  • Keep values containing string
data <- data |> filter(grepl("mot particulier", column) ==  TRUE)
data <- data |> filter_all(all_vars(grepl("mot", .)))
  • Even and odd lines
data |> filter(row_number() %% 2 ==  0) # pair
data |> filter(row_number() %% 2 ==  1) # impair
  • All columns equal a value
data |> group_by(cat) |> filter(across(where(is.character), ~. != "N/A"))
  • At least 2 values by group
data |> 
    filter(any(projet %in% c("proj1234", "proj4321")), 
           .by = id)
  • Filter on first and second minimum values by group
data |> 
    filter((degre_etude == min(degre_etude) | degre_etude == min(degre_etude[degre_etude != min(degre_etude)])) |
           (degre_etude == min(degre_etude) | is.na(degre_etude)), 
         .by = id)


Traitement de dataframes


Formats

  • Pivot longer
m3 <- data |> select(c(BATIMENTS:TYPE_DE_BATIMENTS, starts_with("m3"))) |> 
    mutate_all(as.character) |> 
    pivot_longer(cols = -c(BATIMENTS:TYPE_DE_BATIMENTS), names_to = "Annee", values_to = "m3", names_prefix = "m3_")
montant <- data |> select(c(BATIMENTS:TYPE_DE_BATIMENTS, starts_with("montant"))) |> 
    mutate_all(as.character) |> 
    pivot_longer(cols = -c(BATIMENTS:TYPE_DE_BATIMENTS), names_to = "Annee", values_to = "montant", names_prefix = "montant_")
final <- cbind(m3, montant |> select(montant))
  • Pivot wider
data <- data |> 
    pivot_wider(names_from = choix, values_from = nb_interesses, names_prefix = "choix_")
  • Split comma separated values
# Split 
data <- data |> mutate(journal_issns = strsplit(as.character(journal_issns), ",")) |> unnest(journal_issns)
# Unsplit
data <- data |> mutate(journal_issns = paste0(unique(na.omit(journal_issns)), collapse = ","))
  • Repeat row based on 1 column number
data |> uncount(x)
  • Repeat row based on several column number
data.frame(col1 = c(0, 167, 73),
           col2 = c(62, 0, 73)) |> 
    slice(rep(1:n(), times = c(col2[1], col1[2], col2[3])))
  • Random values with specific probability
random <- c("groupe 1", "groupe 2", "groupe 3")
sample(random, size = nrow(data), replace = TRUE, prob = c(1/2,3/5,2/5))


Merge, join

  • Merge 2 or more columns into 1
data <- data |> mutate(new_col = coalesce(col1,col2,col3))
  • Fuzzy join
library(fuzzyjoin)
data <- stringdist_left_join(data, data2, by = "col_name", max_dist = 5, distance_col = "distance") |> 
  group_by(nom) |> slice_min(distance)
  • Merge 2 dataframes ~rbind()
data_merged <- merge(df_1, df_2, all = TRUE) 
  • Alternative rbind() when different number of columns
data <- list(cat1, cat2) |>  bind_rows(.id = 'origine_df')
  • Differences 2 dataframes
anti_join(df1, df2)
  • Similarities 2 dataframes
semi_join(df1, df2)

Autre

  • Redundancy test
identical(data$id1, data$id2)
  • Increase total system memory R session
library(unix) #pour linux
rlimit_as(1e20)  #increases to ~12GB


Analyse de données


Opérations, statistiques

  • NAs per column of a dataframe
# Une colonne
data |> count(is.na(col_name))

# Toute les colonnes
nb_NA <- as.data.frame(apply(is.na(data), 2, sum)) |> 
                       rename(`nombre de NA` = `apply(is.na(data), 2, sum)`) |>
                       mutate(pourcentage = `nombre de NA`/nrow(data)*100) |> 
                       mutate(pourcentage = round(pourcentage, 2)) |> 
                       arrange(desc(pourcentage)) |> 
    rownames_to_column() |> 
    rename(variable = rowname)
  • Non NAs per group
table <- data |> 
    summarise_all(list(~sum(!is.na(.))), .by = group)
  • Values frequency
data <- as.data.frame(table(data$column))  #R base
data <- data |> group_by(group) |> count(column)   #dplyr
data <- data |> summarise(n = n(), .by = group) #dplyr
  • Sum in pipe for specific rows
data |> mutate(new_cat = sum(n[Catégorie2 == "Total"])) #[]
  • Calculate sum accross mutliple columns df
data |> rowwise() |> mutate(sum_multiple = sum(c_across(var_3:var_17)))
  • Calculate difference between rows by group
data |> group_by(Structure) |> mutate(ecart = Pourcentage - lag(Pourcentage))
  • Weighted mean and median
# Fonction pour calculer une médiane pondérée
weighted_median <- function(x, w) {
  df <- data.frame(x = x, w = w) |>
    arrange(x)
  cum_w <- cumsum(df$w)
  cut_point <- sum(df$w) / 2
  median <- df$x[which(cum_w >= cut_point)[1]]
  return(median)
}

# Calculs pondérés
library(stats)
library(questionr)
data |> mutate(n_moyen_stats = round(weighted.mean(n_repondants, PONDFIN_logit), 2), #package stats
               n_moyen_questionr = round(questionr::wtd.mean(n_repondants, PONDFIN_logit), 2), #package questionr
               n_moyen_manuel = round(sum(n_repondants * PONDFIN_logit) / sum(PONDFIN_logit), 2), #calcul manuel
               n_median_pondere = weighted_median(n_repondants, PONDFIN_logit)) #médiane


Dataviz

  • Reorder values before plotting
# geom_bar ordre alphabetic, après arrange()
mutate(colonne = factor(colonne, levels = rev(unique(colonne)))) 
# geom_bar décroissant selon n
mutate(colonne = fct_reorder(colonne, n))
  • Arrange multiple plots
# Afficher plusieurs ggplots
library(gridExtra)
grid.arrange(g1,g2,g3, ncol = 3, nrow = 1, 
             top = grid::textGrob("Titre", gp = grid::gpar(fontsize = 15, font = 2)))

# Aligner les boxes
library(cowplot)
plot_grid(p3.1, p3.2, p3.3, p3.4, p3.5, align = 'vh')

# Afficher plusieurs ggplotlys
library(plotly)
subplot(plotly_positif, plotly_negatif, nrows = 1)

# Pas de message grid.arrange() dans rmd
graph <- grid.arrange(g1,g2)
grid::grid.draw(graph)
  • Interactive graph
# Passer en plotly
ggplotly(graph, tooltip = c("text")) |> 
    layout(xaxis = list(autorange = TRUE), yaxis = list(autorange = TRUE)) #auto adjust scale when click on element

# Passer en giraph : mettre les geométries en interactif !! ex: geom_segment_interactive()
graphc <- ggplot(data, aes(x = Réponses, y = Pourcentage, fill = Edition,
                              tooltip = paste0(Pourcentage*100, "% en ", Edition))) +#texte au survol
    geom_point_interactive() + 
    theme_minimal()
girafe(print(graph), width_svg = 15, height_svg = 12)
  • Highlight bar on stacked barplot with alpha
data |> 
  ggplot(aes(y = n)) +
  geom_col(aes(x = cycle, fill = rowname, alpha = cycle != "Catégorie"), color = "white", position = "stack", width = 0.7) +
  scale_alpha_manual(values=c(1, .4))
  • Custom_theme() BBC
font <- "Helvetica"
custom_theme <- function (){
    font <- "Helvetica"
    ggplot2::theme(plot.title = ggplot2::element_text(family = font,size = 21, face = "bold", color = "#222222"), 
        plot.subtitle = ggplot2::element_text(family = font,size = 18, face = "italic", margin = ggplot2::margin(0, 0, 9, 0)), 
        plot.caption = ggplot2::element_text(family = font,size = 18, margin = ggplot2::margin(9, 0, 9, 0)), 
        plot.title.position = "plot",
        plot.caption.position = "plot",
        legend.title = ggplot2::element_text(family = font, size = 18, color = "#222222"), 
        legend.position = "top", 
        legend.text.align = 0, 
        legend.background = ggplot2::element_blank(),
        legend.key = ggplot2::element_blank(),
        legend.text = ggplot2::element_text(family = font, size = 18,color = "#222222"), 
        axis.text = ggplot2::element_text(family = font, size = 15,color = "#222222"), 
        axis.text.x = ggplot2::element_text(margin = ggplot2::margin(5,b = 10)), 
        axis.title = ggplot2::element_text(family = font, size = 18,color = "#222222"),
        axis.ticks = ggplot2::element_blank(),
        axis.line = ggplot2::element_blank(), 
        panel.grid.minor = ggplot2::element_blank(),
        panel.grid.major.y = ggplot2::element_line(color = "#cbcbcb"),
        panel.grid.major.x = ggplot2::element_blank(), 
        panel.background = ggplot2::element_blank(),
        strip.background = ggplot2::element_rect(fill = "white"),
        strip.text = ggplot2::element_text(size = 22, hjust = 0, face = "bold"))
}
custom_theme_largeG <- function (){
    font <- "Helvetica"
    ggplot2::theme(plot.title = ggplot2::element_text(family = font,size = 25, face = "bold", color = "#222222"), 
        plot.subtitle = ggplot2::element_text(family = font,size = 18, face = "italic", margin = ggplot2::margin(0, 0, 9, 0)), 
        plot.caption = ggplot2::element_text(family = font,size = 18, margin = ggplot2::margin(9, 0, 9, 0)), 
        plot.title.position = "plot", #titre commence où y-axis commencent !!
        plot.caption.position = "plot",
        legend.title = ggplot2::element_text(family = font, size = 18, color = "#222222"), 
        legend.position = "top", 
        legend.text.align = 0, 
        legend.background = ggplot2::element_blank(),
        legend.key = ggplot2::element_blank(),
        legend.text = ggplot2::element_text(family = font, size = 18,color = "#222222"), 
        axis.text = ggplot2::element_text(family = font, size = 24,color = "#222222"), 
        axis.text.x = ggplot2::element_text(margin = ggplot2::margin(5,b = 10)), 
        axis.title = ggplot2::element_text(family = font, size = 27,color = "#222222"),
        axis.ticks = ggplot2::element_blank(),
        axis.line = ggplot2::element_blank(), 
        panel.grid.minor = ggplot2::element_blank(),
        panel.grid.major.y = ggplot2::element_line(color = "#cbcbcb"),
        panel.grid.major.x = ggplot2::element_blank(), 
        panel.background = ggplot2::element_blank(),
        strip.background = ggplot2::element_rect(fill = "white"),
        strip.text = ggplot2::element_text(size = 22, hjust = 0, face = "bold"))
}

# change grid when coord_flip()
theme(panel.grid.major.x = ggplot2::element_line(color = "#cbcbcb"),
      panel.grid.major.y = ggplot2::element_blank())


Par élément

Geometry

# Geométrie initiale
  geom_line(size = 1.7, alpha = 0.9, linetype = 1, color = "#0066CC") +
  geom_point(colour = "#0066CC", fill = "#0066CC", size = 2, pch = 21, stroke = 1.5) +
  geom_bar(position = position_dodge(.9), stat = "identity", width = .8, fill = "#21468d") + #.9 et width pour barres pas collées
  geom_bar(aes(x = forcats::fct_infreq(adequation))) + #fct_infreq pour ordonner selon count
  geom_col(position = "stack", width = 0.7, color = "white") +  coord_flip() + #cas particulier de geom_bar où on prend n comme Y et non count
  geom_text_wordcloud(family = "Montserrat") +
    
# Géométrie additionnelle
  geom_text(aes(y = 1, label = title_projet), hjust = "bottom", #aligner geom_text à gauche avec coord_flip
            fontface = "italic", size = 5, hjust = 0) +
  geom_label(aes(y = 1, label = title_projet), hjust = "bottom", fontface = "italic", size = 2.6,
             fill = "white", label.size = NA, hjust = 0) + #white background and remove black borders à la fin sinon marche pas !
  geom_label(aes(y = 1, label = title_projet), hjust = "bottom", fontface = "italic", size = 2.6,
             fill = "white", label.size = NA, position = position_dodge(width = .9), hjust = 0) + #pour double barres plot (dodge)
  stat_count(geom = "text", colour = "white", size = 4,
             aes(label = ..count.., y = ..count..+.7), #y pour positionnement juste au dessus des barres
             position = position_stack(vjust = 0.5)) + #geom_text des geom_bar sans y
  geom_vline(xintercept = -.5, linetype = 2, color = "#0066CC") +
    
# Annotate a graph
coord_fixed(clip = 'off') # geom_label() déborde du graph

Scales

  xlim(1, 100) +
  scale_y_continuous(labels = scales::comma) + #grands chiffres lisibles
  scale_y_continuous(breaks = scales::pretty_breaks()) + #breaks réguliers, plus lisible (pas d'axe)
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) + # pourcentages
  scale_y_discrete(limits = 1:12) + #valeurs discrètes
  scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 50)) + #axis-text trop longs sur plusieurs lignes
  scale_color_continuous(high = "#132B43", low = "#56B1F7") #reverse color
  scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 10), limits = c(-.2, 10)) # de 0 à 10 avec les breaks spécifiés
  scale_fill_manual(values = c("Non, mais c’est prévu dans les 12 prochains mois" = "#e1b44d", 
                             "Oui, au cours des deux dernières années" = "#323465", 
                             "Oui" = "#33bbc9"),
                  labels = c("Non, mais c’est prévu dans les 12 prochains mois" = "Prévue dans les 12 prochains mois", 
                             "Oui, au cours des deux dernières années" = "Engagée au cours des deux dernières années", 
                             "Oui" = "Engagée"), #rename categories legend
                  breaks = c("Oui", "Oui, au cours des deux dernières années", "Non, mais c’est prévu dans les 12 prochains mois")) #order items legend

Labs

# titres trop longs, automatiquement coupés
title = stringr::str_wrap("Exemple de titre très très très très très très très très très très très très long", width = 45)

# titres avec des mots colorés
library(ggtext)
ggplot(data) +
    geom_point() +
    labs(title = "Dans mon titre je veux mettre en avant <span style='color: #323465; font-size: 23pt;'>cette catégorie</span> par rapport aux autres") +
    theme(plot.title = element_markdown()) #coloré et taille plus grande

Theme

  theme_classic() +
  theme(panel.grid.major.y = ggplot2::element_line(color = "#cbcbcb"), #lignes horizontales fond graphique en gris (BBC thème)
        strip.text.x = element_text(face = "bold"), #label des facettes
        axis.title.x = element_text(margin = margin(t = 5, r = 0, b = 5, l = 0)), #augmenter marges entre texte et labels des axes
        plot.title = element_textbox_simple(hjust = 1), #hjust: titre aligné à droite, element_textbox_simple line break auto titre
        plot.background = element_rect(fill = "#FAF3EE", colour = "#FAF3EE"), #background couleur Datactivist
        panel.background = element_rect(fill = "#FAF3EE", colour = "#FAF3EE"),
        legend.background = element_rect(fill = "#FAF3EE", colour = "#FAF3EE"),
        legend.box = "vertical", legend.box.just = "left", #multiple guide_legend each one on new row, for top legend 
        ) +

Facet

  facet_grid(Projet ~ ., 
             scales = "free", #scales = "free" pour label différents d'une facette à une autre
             space = "free") + #space = "free" pour hauteur différentes selon le nombre d'éléments par facette
  facet_zoom(x = annee > 2014, split = TRUE) +
  ggforce::facet_col(facets = vars(Projet), 
                     scales = "free_y", 
                     space = "free") # pour avoir scales et face de facet_grid avec labels to the top de facet_wrap
  facet_wrap(~vote, scales='free_x') + scale_y_continuous(limits=c(0 ,50)) # pour avoir les ticks sur chaque facettes et pas juste celles du bas

Legend

  guides(fill = guide_legend(nrow = 6, byrow = TRUE,  # nombre d'éléments par ligne
                             title = "titre légende"), # titre légende
        lwd = "none",  #ne pas afficher une légende en particulier
        col = guide_legend(title = "", reverse = TRUE, override.aes = list(lwd = 2))) + #lwd = 2 pour ligne plus épaisse et plus visible dans la légende

# Deux légendes sur un même graph, affichées l'une sous l'autre
theme(legend.box = "vertical", legend.box.just = "left")

Colors

scale_fill_manual(values = c("#c898ae", "#da4729", "#f38337", "#74a466", "#fecf5d", "#5E79AC")) #couleurs Bauhaus


Graphiques

  • Donut
iris |> 
    group_by(species) |> summarise(Freq = n()) |> 
    mutate(fraction = Freq / sum(Freq),
           proportion = round((Freq / sum(Freq))*100),
           ymax = cumsum(fraction),
           ymin = c(0, head(ymax, n = -1)), 
           labelPosition = (ymax + ymin) / 2) |> 
    ggplot(aes(ymax = ymax, ymin = ymin, xmax = 3.3, xmin = 2, fill = species)) +
      geom_rect(col = "white", linewidth = 2) +
      geom_text(x = 4, aes(y = labelPosition, label = paste(proportion,"%",sep = "")), color = "#333333", size = 7) +
      geom_text(aes(x = 0, y = 0, label = sum(Freq)), col = "#333333", alpha = 0.8, size = 14, fontface = "bold", inherit.aes = FALSE) +
      scale_fill_manual(values = c('#fecf5d', '#21468d','#82888d')) +
      coord_polar(theta = "y") +
      labs(title = "Titre") +
      xlim(c(0, 4)) +   
      theme_void() +
      theme(legend.position = "right",
            plot.title = element_text(face = "bold"))+
      guides(fill = guide_legend(title = ""))

  • Histogram
iris |> 
    ggplot() +
      aes(x = sepal_length) +
      geom_histogram(bins = 7L, col = "white", fill = "#21468d", width = 5) +
      geom_vline(xintercept = mean(iris$sepal_length, na.rm = TRUE), linetype = 2, col = "red") +
      geom_text(aes(x = mean(sepal_length, na.rm = T), y = 1, label = paste("Moyenne :", round(mean(sepal_length, na.rm = T), 1))), 
                col = "red", fontface = "italic", hjust = 0, size = 5) +
      labs(x = "Valeur",y = "Fréquence", title = "Titre",
           subtitle = paste(iris |> filter(is.na(sepal_length)) |> nrow(), "valeur manquante")) +
      theme_classic() +
      theme(plot.subtitle = element_text(face = "italic"),
            plot.title = element_text(face = "bold")) +
      scale_x_continuous(n.breaks = 10)

  • Histogram on multiple variables
iris |> 
  #select(1, 37:40) |> 
  reshape2::melt(id.vars = c("species")) |> 
  ggplot(aes(x = value)) +
      geom_histogram(bins = 10L, col = "white", fill = "#21468d", width = 5) +
      labs(x = "Valeur", y = "Fréquence", title = "Titre") +
      theme_classic() +
      theme(plot.title = element_text(face = "bold")) +
      facet_wrap(variable ~ .)

  • Histogram with normal distribution
ggplot(data = table_femmes) + 
  geom_histogram(mapping = aes(x = percent, y=..density..), fill = "#fd710f", color = "white", alpha = .7) +
  stat_function(fun = dnorm, args = list(mean = mean(table_femmes$percent), sd = sd(table_femmes$percent)), 
                size = 1, alpha = .8, aes(col = "Distribution normale")) +
  scale_colour_manual("", values = c("#cb1d27")) +
  labs(title = "Distribution de la part des femmes dans les cohortes", x = "Pourcentage de femmes", y = "Densité") +
  theme_classic() +
  scale_x_continuous(labels = scales::percent, limits = c(0,1)) +
  geom_vline(xintercept = .5, linetype = 2, col = "#666666", linewidth = .7) +
  custom_theme() +
  theme(plot.title = ggplot2::element_text(size = 18))
  • Pyramid
library(lemon)
iris |> 
  filter(species !=  "setosa") |> 
  mutate(tranche_pw = cut(petal_width, c(0.1, 0.5, seq(1, 2.5, .5)))) |> 
  group_by(tranche_pw, species) |> 
  summarise(Freq = n()) |> ungroup() |> group_by(species) |> 
  mutate(percent = Freq / sum(Freq) *100,
         percent = ifelse(percent < 0.5, round(percent, 1), round(percent, 0))) |> 
  na.omit() |> 
  ungroup() |> 
  ggplot(mapping = aes(x = ifelse(species ==  "versicolor", -Freq, Freq), y = tranche_pw, fill = species)) +
      geom_col(col = "white", size = 1.3) + #col = "black"
      geom_text(aes(y = tranche_pw, x = ifelse(species ==  "versicolor", -Freq-1, Freq+1), label = paste(percent,"%",sep = "")), 
                color = "#333333", size = 3, check_overlap = T) +
      scale_x_symmetric(labels = abs) + 
      scale_colour_manual(values = c('#fecf5d', '#21468d'),
                          aesthetics = c("colour", "fill")) +
      labs(x = "Fréquence", y = "", title = "Titre") +
      theme_classic() + 
      theme(legend.position = "right",
            plot.title = element_text(face = "bold"),
            axis.title.y = element_blank(),
            axis.line.y = element_blank(),
            axis.ticks.y = element_blank()) +
      guides(fill = guide_legend(title = ""))

  • Bar
iris |> 
    group_by(species) |> 
    summarise(n = n()) |> ungroup() |> na.omit() |> 
    mutate(percent = round((n / sum(n))*100, 0)) |> 
    mutate(species = fct_reorder(species, n)) |> 
    ggplot(aes(x = species, y = n))+
        geom_bar(stat = "identity", width = .6, fill = "#21468d") +
        geom_text(aes(y = n+3, x = species, label = paste(percent,"%",sep = "")), 
                color = "#333333", size = 3, check_overlap = T) +
        coord_flip() +
        labs(x = "", y = "Fréquence", title = "Titre") +
        scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 70)) + #axis-text trop longs sur plusieurs lignes
        theme_classic() +
        theme(plot.title = element_text(face = "bold"),
              axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))

  • Radar
devtools::install_github("ricardo-bion/ggradar")  #installation impossible
library(ggradar)
iris |> 
  select(sepal_length, sepal_length:petal_width) |> 
  summarise_all(funs(mean(., na.rm = T))) |> 
  ggradar(values.radar = c("1", "3", "5"),
          grid.min = 1, grid.mid = 3, grid.max = 5,
          # Polygones
          group.line.width = 1, 
          group.point.size = 3,
          group.colours = "#21468d",
          # Arrière-plan et lignes de grille
          background.circle.colour = "white",
          gridline.mid.colour = "grey") +
  xlim(-10,10) + #selon longueur catégories
  labs(title = "Titre") +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"))
  • Comparison gauge
comp_data <- data.frame("variable" = c("sepal_length", "sepal_width", "petal_length", "petal_width"),
           "Minimum" = c(min(iris$sepal_length), min(iris$sepal_width), 
                         min(iris$petal_length), min(iris$petal_width)),
           "Maximum" = c(max(iris$sepal_length), max(iris$sepal_width), 
                         max(iris$petal_length), max(iris$petal_width)),
           "Moyenne" = c(mean(iris$sepal_length), mean(iris$sepal_width), 
                         mean(iris$petal_length), mean(iris$petal_width)))
comp_data |> 
  arrange(variable) |> 
  mutate(variable = factor(variable, levels = rev(unique(variable)))) |> 
  ggplot() +
    geom_segment(aes(x = Minimum, xend = Maximum, y = variable, yend = variable, colour = variable), lwd = 10) +
    geom_segment(aes(x = Moyenne, xend = Moyenne+.05, y = variable, yend = variable), 
                 colour = ifelse(comp_data$variable ==  "sepal_length", "white", "black"), lwd = 10) +
    geom_text(aes(x = Moyenne+.2, y = variable, label = paste("Moyenne :", round(Moyenne, 1)), hjust = "bottom"), 
              size = 3, col = ifelse(comp_data$variable ==  "sepal_length", "white", "#333333")) +
    labs(title = "Titre", x = "Score", y = "") +
    scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 50)) +
    scale_color_manual(values = c("petal_length" = "#21468d", 
                                  "sepal_width" = "#fecf5d", 
                                  "sepal_length" = "#d55152", 
                                  "petal_width" = "#ddd6ad")) +
    theme_bw() +
    theme(legend.position = "none", 
          plot.title = element_text(face = "bold"))

  • Infographie simple
library(icons)
library(tidyverse)
# Table
df <- data.frame(
    x = c(2, 8.5, 15, 21.5),
    y = rep(6.5, 4),
    h = rep(4.25, 4),
    w = rep(6.25, 4),
    value = c(5, 7, 17, 5),
    info = c("Communes",
             "Participants",
             "Jeux de données \nouverts",
             "Réutilisations"),
    color = factor(1:4)
)

# Graphique
ggplot(df, aes(x, y, height = h, width = w, label = info)) +
    ## Create the tiles using the `color` column
    geom_tile(aes(fill = color)) +
    ## Add the numeric values as text in `value` column
    geom_text(color = "white", fontface = "bold", size = 16,
              aes(label = ifelse(value > 999, format(as.integer(value, 0), nsmall = 1, big.mark = "."), value), 
                  x = x, y = y + 0.7), 
              hjust = 0.5) +
    ## Add the labels for each box stored in the `info` column
    geom_text(color = "white", fontface = "bold", size = 10,
              aes(label = info, x = x - 2.9, y = y - 1), hjust = 0) +
    coord_fixed() +
    #scale_fill_manual(type = "qual", palette = "Dark2") +
    scale_fill_manual(values = c("#9bcea4", "#ef7875", "#ffcc00", "#23ae84", "#fecf5d", "#21468d")) +
    ## Use `geom_text()` to add the icons by specifying the unicode symbol.
    theme_void() +
    guides(fill = FALSE)
  • Correlation matrix with ggplot
    # couleurs
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
bgcolors <- matrix("black", nrow(matrix), ncol(matrix),dimnames = dimnames(matrix))
bgcolors[,2] <- "red"
bgcolors <- bgcolors[lower.tri(bgcolors, diag=TRUE)]
    # matrice
ggcorrplot(matrix, hc.order = T, type = "lower", show.diag = TRUE, legend.title = "",
           lab = TRUE, lab_col = bgcolors, colors = c("#BB4444", "white", "#4477AA")) +
    labs(title = stringr::str_wrap("Décorrelation entre la phase <span style='color: red;'>Analyse, traitement, calcul</span> et les autres phases du cycle de vie", width = 50), 
         subtitle = "Les phases de dépôt et de partage fortement connectées entre elles", 
         caption = "Sur quelle(s) partie(s) du cycle de vie des données intervenez-vous ? - 5642 réponses") +
    theme(axis.text.y = element_blank(),
          panel.grid = element_blank(),
          plot.subtitle = element_markdown(size = 15),
          plot.title = element_markdown(size = 18, lineheight = 2),
          plot.caption = element_text(size = 15, face = "italic", hjust = 0, color = "#cbcbcb")) +
    geom_label(aes(x = 6, y = 7), label = "Dépot", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    geom_label(aes(x = 5, y = 6), label = "Préparation du dépôt", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    geom_label(aes(x = 4, y = 5), label = "Ouverture ou partage", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    geom_label(aes(x = 3, y = 4), label = "Planification, PGD", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    geom_label(aes(x = 2, y = 3), label = "Réutilisation", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    geom_label(aes(x = 1, y = 2), label = "Analyse, traitement, calcul", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    geom_label(aes(x = 0, y = 1), label = "Collecte, création, stockage", family = font, vjust = .4, hjust = "right", 
               fill = "white", label.size = NA) +
    coord_fixed(clip = 'off') 
  • Positive and negative evolutions
test <- table |> 
  mutate(ecart = ecart/100,
         type_ecart = case_when(ecart >= 0 ~ "positif",
                                ecart < 0 ~ "negatif",
                                .default = NA_character_)) |> 
  select(Structure, Réponses, ecart, type_ecart) |> na.omit() 
test |> 
  ggplot(aes(x = Réponses, y = ecart, fill = type_ecart)) +
    geom_bar(stat="identity", width=.6, size = 2, alpha = .7) +
    coord_flip() +
    labs(x = "", y = "Écart en points de pourcentage", 
         subtitle = "Évolution des réponses entre l'édition 2022 et l'édition 2023",
         title = stringr::str_wrap("Commençons par la protection de la vie privée. Considérez-vous que votre structure est en conformité avec le RGPD ?", width = 55)) +
    scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) + 
    scale_y_continuous(labels = scales::percent) + # pourcentages
    scale_fill_manual(values = c("positif" = "#288a28", "negatif" = "#a81008")) +
    theme_classic() +
    custom_theme() +
    theme(legend.position = "none",
          panel.grid.major.x = ggplot2::element_line(color = "#cbcbcb"),
          panel.grid.major.y = ggplot2::element_blank(),
          plot.subtitle = element_markdown()) +
    geom_label(aes(x = Réponses, y = ecart, label = paste0(ecart*100, "%")), 
              color = "#333333", size = 4, fill = "white", label.size = NA, 
              hjust = ifelse(test$ecart >= 0, 0, 1))
  • Cartography
# Graph
    # titre
tag.map.title <- tags$style(HTML("
  .leaflet-control.map-title { 
    transform: translate(0%,-170%);
    left: 7%;
    text-align: center;
    padding-left: 10px; 
    padding-right: 10px; 
    font-weight: bold;
    font-size: 22px;
    color: black;
  }
"))
title <- tags$div(tag.map.title, HTML("Localisation des participants aux projets (hors ZFE)"))

    # carte
proj_loin |> 
  leaflet() |>
  addTiles() |> 
  addControl(title, position = "topleft", className = "map-title") |> 
  setView(lng = 3, lat = 47, zoom = 4.8) |> 
  addCircles(radius = proj_loin$nb_contrib, lng = proj_loin$longitude, lat = proj_loin$latitude, #color = proj_loin$col, 
             weight = 1, opacity = proj_loin$nb_contrib, fillOpacity = .1, 
             label = proj_loin$nom_commune,
             popup = paste0(proj_loin$nom_commune, ", ", format(as.integer(proj_loin$nb_contrib, 0), nsmall = 1, big.mark = "."), 
                            ifelse(proj_loin$nb_contrib <= 1, " contribution dont :\n", " contributions dont :\n"),
                           "<br>• ", format(as.integer(proj_loin$questionnaires, 0), nsmall = 1, big.mark = "."), ifelse(proj_loin$questionnaires <= 1, " questionnaire", " questionnaires"),
                           "<br>• ", format(as.integer(proj_loin$propositions, 0), nsmall = 1, big.mark = "."), ifelse(proj_loin$propositions <= 1, " proposition", " propositions"),
                           "<br>• ", format(as.integer(proj_loin$commentaires, 0), nsmall = 1, big.mark = "."), ifelse(proj_loin$commentaires <= 1, " commentaire", " commentaires"),
                           "<br>• ", format(as.integer(proj_loin$votes, 0), nsmall = 1, big.mark = "."), ifelse(proj_loin$votes <= 1, " vote", " votes"))) |> 
  addProviderTiles(provider = "Esri.WorldGrayCanvas")
# all providers : http://leaflet-extras.github.io/leaflet-providers/preview/index.html

    # Carte avec les points qui se précisent au fur et à mesure du zoom
test |> 
  leaflet() |>
  addTiles() |> 
  setView(lng = 3, lat = 47, zoom = 4.8) |> 
  addMarkers(lng = test$long, lat = test$lat, 
             label = test$Concessionnaires_Dénominationsociale,
             clusterOptions = markerClusterOptions()) |> #clusterOptions fait cet effet
  addProviderTiles(provider = "Esri.WorldGrayCanvas")
  • Legend out of graph
# Légende
graph_leg <- table_all |> 
filter(Structure == "Une commune" | Structure == "Un EPCI" | Structure == "Un département" | 
       Structure == "Une région" | Structure == "Autre") |> 
mutate(Structure = factor(Structure, levels = c("Autre","Une région","Un département","Un EPCI","Une commune"))) |> 
ggplot(aes(x = Structure, y = Pourcentage, fill = Réponses,
           text = paste0(Pourcentage*100, "% de réponses '", Réponses, "'"))) + 
  geom_bar(position = "fill", color = "white", stat="identity", alpha = .9) +
  labs(x = "", y = "Pourcentage de réponses", title = "Par type de collectivité") +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  scale_fill_manual(values = c("grey", "#bf0001", "#e1b44d", "#33bbc9", "#323465", "#616165", "#0a0a14")) +
  facet_wrap(~Edition) +
  coord_flip() +
  theme_classic() +
  custom_theme() +
  guides(fill = guide_legend(ncol = 1, title = "", reverse = T)) +
  theme(legend.position = "top",
            panel.grid.major.x = ggplot2::element_line(color = "#cbcbcb"),
            panel.grid.major.y = ggplot2::element_blank())
legend <- get_legend(graph_leg + theme(legend.box.margin = margin(0, 0, 10, 12)))
  • Dual axis on ggplot
table |> 
    mutate(timestamp = as.Date(as.character(paste0(timestamp, "-01")))) |> 
  ggplot(aes(x = timestamp
             #text = paste(n_cumule, "inscrits en", format(timestamp, "%b %Y"))
             )) +
  geom_line( aes(y=n_cumule), size=1, alpha=0.9, color = "#3366CC") + 
  geom_line( aes(y=n_cumule_contrib/3.785737), size=1, alpha=0.9, color = "#CC0000") +
  labs(x = "Temps", y = "Nombre d'inscrits", 
       title = stringr::str_wrap("Évolution du nombre d'inscrits et du nombre de contributions cumulés sur la plateforme depuis son lancement", width = 40)) +
  scale_y_continuous(name = "Nombre total d'inscrits",
    sec.axis = sec_axis(~ . * 3.785737, name = "Nombre total de contributions")) +  #scale_x_date(date_labels = "%Y %b") +
  theme_classic() +
  custom_theme() +
  theme(legend.position = "right",
        axis.title.y = element_text(color = "#3366CC"),
        axis.title.y.right = element_text(color = "#CC0000"))
  • Treemap
library(treemap)
library(treemapify)
table |> 
  filter(row_number() <= 10) |> 
  ggplot() +
      geom_treemap(aes(area = effectif_couple_spe, fill = couple_spe), col = "white", size = 4) +
      geom_treemap_text(aes(area = effectif_couple_spe, fill = couple_spe, 
                            label = paste0(couple_spe, "\n(", percent, "%)")),
                        colour = "white", place = "centre", size = 15, grow = TRUE) +
      scale_fill_manual(values = c("#345E68", "#FEDEA0", "#B7C2A5", "#023743","#7A9BB1", "#B8AA75", "#7B8598", "#345B48", "#476F84", "#D0BA7C")) +
      labs(title = "Les 10 principales spécialités au BAC choisies par les étudiants d'Ile-de-France") +
      custom_theme() +
      theme(legend.position = "none")


Tables

gttable()

library(gt)
library(gtExtras)
data |> 
  gt() |>
  gt_theme_nytimes() |> 
  tab_header(title = "Mon titre") |>
  tab_style(style = list(cell_text(weight = "bold")), #en gras
    locations = cells_body(columns = ma_col)) |> 
  cols_align(align = "center", columns = everything()) |> 
  cols_align(align = "left", columns = c(col1, col2)) |> 
  gt_plt_bar_pct(`% temps`, scaled = TRUE, labels=TRUE, decimals = 0, #attention à remplacer les NA par 0 sinon erreur
           font_size = "14px", fill = "#173541", height = 20)


DT::datatable()

datatable(data, options = list(pageLength = 5, scrollX = TRUE))


knitr::kable()

knitr::kable(stat_indiv, format = "html") |> 
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))


reactable()

  • Colorize cells based on categorical column
library(reactable)

# Fonction pour colorer les catégories
category_styles <- function(Catégorie) {
  case_when(
    Catégorie == "Faible" ~ list(background = "#e3f2fd", color = "black"),
    Catégorie == "Moyen" ~ list(background = "#b5dbfb", color = "black"),
    Catégorie == "Élevé" ~ list(background = "#1d82df", color = "white"),
    Catégorie == "Très élevé" ~ list(background = "#0d47a1", color = "white"),
    Catégorie == "NA" ~ list(background = "#CCCCCC", color = "black"),
    TRUE ~ list(background = NA, color = NA)
  )
}

# Affichage de la table
table |> 
    reactable(columns = list(Catégorie = colDef(style = function(value){category_styles(value)})))
  • Colorize all columns except one based on numerical column
# Indicateurs en une table
table <- data.frame(
  Nom = c("nom1", "nom3", "nom3"),
  Score = c(85, 92, 78),
  Age = c(25, 30, 22),
  Height = c(160, 175, 168)
)

# Fonction pour colorer les indicateurs
indicateur_styles <- function(Indicateur) {
    # calcul des statistiques
  q1 <- quantile(Indicateur, 0.25, na.rm = TRUE)
  med <- median(Indicateur, na.rm = TRUE)
  q3 <- quantile(Indicateur, 0.75, na.rm = TRUE)
    # conditions de mise en forme
  sapply(Indicateur, function(value) {
    dplyr::case_when(
      value <= q1 ~ "background-color: #e3f2fd; color: black;",
      value > q1 & value <= med ~ "background-color: #b5dbfb; color: black;",
      value > med & value <= q3 ~ "background-color: #1d82df; color: white;",
      value > q3 ~ "background-color: #0d47a1; color: white;",
      TRUE ~ "background-color: #CCCCCC; color: black;"
    )
  })
}

# Liste des colonnes sur lesquelles appliquer le style
column_defs <- setNames(
  lapply(names(table), function(col) {
    colDef(
      style = if (col != "Nom") {
        function(value, index, name) {
          indicateur_styles(table[[col]])[index]
        }
      } else {
        NULL
      }
    )
  }),
  names(table)
)

# Affichage de la table
table |> 
    reactable(columns = column_defs, defaultPageSize = 5)
  • Freeze first column
# Fixer la première colonne
data |> reactable(columns = list(ID = colDef(sticky = "left")))

# Fonction pour fixer la première colonne en plus d'appliquer aux autres colonnes la mise en forme des couleurs
column_defs <- setNames(
  lapply(names(table), function(col) {
    if (col != "ID") {
      colDef(
        style = function(value, index, name) {
          indicateur_styles(table[[col]])[index]
        }
      )
    } else {
      colDef(
        sticky = "left"
      )
    }
  }),
  names(table)
)
  • Export reactable as HTML and JPG
# Export de la table
library(htmlwidgets)
library(webshot2)
saveWidget(widget = table_rea, file = "../mon_path/data.html", selfcontained = TRUE)
webshot2::webshot("../mon_path/data.html", "../mon_path/data.jpg", 
                  vwidth = 1200, vheight = 600)


Fin d’analyse


Export

  • Export CSV
rio::export(data, "~/Downloads/tableau.csv")
write.csv(data, "~/Downloads/tableau.csv", row.names = FALSE, fileEncoding = "UTF-8")
  • Export plots
saving_plot <- function(graph, name, width, height) {
  ggsave(file = glue("~/Downloads/SVG/{name}.svg"), plot = graph, width = width, height = height)
  ggsave(file = glue("~/Downloads/PNG/{name}.png"), plot = graph, width = width, height = height)
}
saving_plot(graph, "histogram", 9, 5)
  • Export html objects (leaflet map, plotly)
library(htmlwidgets)
saveWidget(map, file = "ma_carte.html")


R Markdown

  • Footer, CSS, header files in other folder
includes:
    in_header: !expr here::here("inst/rmarkdown/resources/header.html")
  • Logo / image at top of the document
# Logo haut de page
htmltools::img(src = "lien/vers/mon/image", 
               alt = 'logo', 
               style = 'position:absolute; top:0; right:0; width:400px') #width pour la taille! (ici positionné en haut à droite)
  • Output file into ‘reports’ folder
# Mettre dans le header du document RMD
knit: (
  function(inputFile, encoding) { 
    rmarkdown::render(inputFile, params = "ask",  
      encoding    = encoding,
      output_dir = "../reports", 
      output_file = paste0(tools::file_path_sans_ext(inputFile), ".html")) })
  • Justify text
# Mettre en corps de texte du RMD où dans un fichier CSS à part
<style>
body {
text-align: justify
}
</style> 
  • Custom dfsummary
print(dfSummary(data_summary, style = "grid", graph.magnif = 1, 
                valid.col = FALSE, varnumbers = FALSE, tmp.img.dir = "/tmp", 
                max.distinct.values = 5, headings = FALSE, method = "render", 
                col.widths  = c(300, 200, 100, 50, 20)),
      max.tbl.height = 600,
      method = "render")
  • Use installed font family
library(showtext)
font_add("Nexa", regular = "Nexa Bold.otf")
font_add("Trade Gothic", regular = "Trade Gothic.otf")
showtext_auto()
# then specify on the CSS the name of the font
  • TOC
output:
    rmarkdown::html_document:
      toc: true
      toc_float: true
      toc_depth: 2
      number_sections: true
# {-} après certains titres si on veut enlever le numérotage automatique pour ceux-là
  • Python on Rmd
# En début de Rmd
library(reticulate)

# Première utilisation
py_install("pandas") #pandas
pip install plotly #plotly, à runer dans le terminal

# Pour utiliser un environnement virtuel
Sys.setenv(RETICULATE_PYTHON = "path/to/python.exe") 
virtualenv_create("test_proj")
py_install("pandas", envname = "test_proj", method = "auto")
use_virtualenv("test_proj")
#```{python}
import pandas
import plotly.express as px
matrice = [[43, 57], [12, 88]]
fig = px.imshow(matrice)
fig.show()
#```
  • Remove blank space end RMD file
<div class = "tocify-extend-page" data-unique = "tocify-extend-page" style = "height: 0;"></div>
  • RMD collaboratif
# Déposer Rmd sur GDrive pour travailler en collaboration   
    # LE METTRE DANS UN DOSSIER TRACKDOWN ET LE NOM EN LIGNE DOIT GARDER L'EXTENSION .RMD
trackdown::upload_file(file = "scripts/Rapport_final.Rmd", gfile = "Rapport_final.Rmd")
trackdown::download_file(file = "scripts/Rapport_final.Rmd", gfile = "Rapport_final.Rmd")
  • Insert and center image in Rmd page
<p align="center">
 <img src="../figures/graphique.png" width = "110">
</p>
  • Embed HTML in Rmd page

Code à mettre en corps de texte pour que ça run :

# Embed centré, html en ligne (ex: carte ODS)
<div align="center">
<iframe frameborder="0" width="800" height="600" src="lien/vers/mon/graphique"></iframe>
</div>

# HTML en local
<iframe src="Save_cartos/carte_p17_t1.html" height="600" width="1000" style="border: 0px solid #464646;" allowfullscreen="" allow="autoplay" data-external="1"></iframe>

Attention bien mettre “self_contained: false” dans le header. Ouvrir dans un browser pour voir le résultat.

Autre solution :

knitr::include_url(glue('../figures/tableau_global/tableau_global_{name_salarie}.html'))

Mettre ça dans <style> en corps de texte du RMD pour enlever la bordure noire.

iframe {
  border: none;
}
  • Documentation

Pimp my rmd, Yan Holtz


Git

  • Cancel last commit NOT pushed
git reset HEAD~1
  • Cancel last commit pushed
git revert HEAD
  • Merge diverging branches
git config pull.rebase false
  • Stash local changes for specific file
git stash push
git stash push scripts/tableau_de_bord.html #specific file
  • Stash local changes for all files
git stash # annuler le dernier commit
git stash push --include-untracked # supprimer tous les changements locaux (pull possible ensuite)

Quand message “needs merge” :

  • data/table_tjh.csv
  • commit
  • push


Github actions

  • Install package from github (not from CRAN)
  1. Télécharger le repo contenant le package via l’URL : https://github.com/account/reponame/archive/branchname.tar.gz
  2. Le sauvegarder dans un dossier du repo de travail actuel
  3. L’installer via cette commande à ajouter au .yaml de l’action (name: install icons package, sans oublier d’installer le package remotes)
# Intégrer dans .github/workflows/render-document.yaml
      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          cache-version: 2
          packages:
            any::knitr
            any::tidyverse
            any::flexdashboard
            any::remotes
            
      - name: install icons package
        run: |
          Rscript -e "remotes::install_url('https://raw.githubusercontent.com/datactivist/plans_de_charge/main/.github/workflows/master.tar.gz')"


Divers


  • Bash commands in R
system(glue("cat {in_dir}/raw_data_{year}.jsonl | jq -c '{{doi, year, bso_classification, hal_id}}' | jq --slurp > {out_dir}/unnested_data_{year}.json"))
  • Parallel execution command
library(parallel)
library(doParallel)
library(foreach)
years <- 2013:2020

numCores <- 2
registerDoParallel(numCores)

foreach (year = years) %dopar% {
  ma_fonction(year)
}

stopImplicitCluster()


Icônes

  • Icons pages

https://fontawesome.com/search?o=r&m=free

https://ionic.io/ionicons

https://jpswalsh.github.io/academicons/

  • Icons into rmarkdwon

fontawesome::fa("play", fill = "#0000CC"), meilleure solution.

icon_style(fontawesome("play", style = NULL), fill = "#0000CC"), implique ce chunk en début de Rmd :

#remotes::install_github("mitchelloharawild/icons", force = TRUE)
library(icons)
#download_fontawesome()
library(extrafont)
  • Illustrations libres de droits

https://cocomaterial.com/

Fonctions créées

  • Export object created within function
assign(glue("n_{year}"), n, envir = .GlobalEnv)
  • Detach created object from environment
rm(ls = inter_bv, ratio, area_commune, area_2017)
  • Possibly function treatment
tryCatch(ma_fonction(data), error = function(e) NULL)
  • Use function saved in other directory
source(here("functions", "match_commune.R"))
object <- memoise::memoise(match_commune, cache = memoise::cache_filesystem(here("cache")))
  • Call variable within created function
fonction <- function(data, variable){
    data |> filter({{variable}} == 2)
}
  • Rename column wihtin created function
fonction <- function(new_cols, cols){
    data |> rename({{ new_cols }} := {{ cols }}) 
}
  • Huge utilisation created function
library(purrr)
purrr::map(.x = c(13:16, 18, 26:38, 40:44, 46, 49, 55, 66:68, 70, 84:90, 93:96, 99:108, 111:113, 115, 139, 162:178, 180, 181, 188:197),
          .f = ~table_recap_simple(.x))
table_recap_stat <- rbind(lapply(ls(pattern="^all_stat_"), function(x) get(x))) |> 
    bind_rows()
  • Create column if it does not already exist
datat <- data |> mutate(`NA` = ifelse("NA" %in% names(data), `NA`, "0%"))
  • “Error in auto_copy(): ! x and y must share the same src”

Dans une fonction créée, lorsqu’une base de données est appelée sans être mise comme argument attendu, il faut l’ajouter comme argument de fonction.

  • Get statistics within function
inter_categories <- function(data, variable){
    # Calcul
    min_val <- min(data[[variable]], na.rm = TRUE) #data[[variable]] pour accéder à la variable dans un df
    q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
    med <- median(data[[variable]], na.rm = TRUE)
    q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
    max_val <- max(data[[variable]], na.rm = TRUE)
    
    # Assignation à l'environnement global
    assign("min_val", min_val, envir = .GlobalEnv) #nom d'objet "min_val" et pas "min" pour pas crééer de conflit avec le nom de fonction
    assign("q1", q1, envir = .GlobalEnv)
    assign("med", med, envir = .GlobalEnv)
    assign("q3", q3, envir = .GlobalEnv)
    assign("max_val", max_val, envir = .GlobalEnv)
}
inter_categories(table, "Densité (habitant par m²)") #nom de variable entre guillemets et pas backticks sinon est considéré comme objet à part et non une variable du jeu de données
  • Automatic tab for elements of a list in a flexdashboard document
# voir script ici [repo privé] : https://github.com/datactivist/plans_de_charge/blob/main/scripts/tableau_de_bord.Rmd
# ATTENTION : ne pas mettre de caractères spéciaux (&.) dans les noms data-navmenu=""
 

Document sous licence ouverte réalisé par Diane Thierry

diane@datactivist.coop