library(readxl)
library(plyr)
library(dplyr)
library(tricky)
library(tibble)
library(readr)
library(knitr)

Find keys

  • find_keys() looks at a table and returns a data frame with the name of all available keys.
read_csv(
  system.file(
    "extdata", 
    "table_deputes.csv", 
    package = "tricky")
  ) %>% 
  find_keys() 
## Parsed with column specification:
## cols(
##   nom = col_character(),
##   genre = col_character(),
##   prenom = col_character(),
##   nom_famille = col_character(),
##   date_naissance = col_datetime(format = ""),
##   lieu_naissance = col_character(),
##   circonscription = col_character(),
##   sycomore_id = col_integer(),
##   wikidata = col_character(),
##   frwiki = col_character()
## )
## Warning: Truncating vector to length 1
##               keys
## 1              nom
## 2            genre
## 3           prenom
## 4      nom_famille
## 5   date_naissance
## 6   lieu_naissance
## 7  circonscription
## 8      sycomore_id
## 9         wikidata
## 10          frwiki

Standardize table names

Many datasets have non standard variable names including accents (ie é, è, à, …), spaces and so on.

The French IT Dashboard is an example of a data set with column names in natural language :

read_excel(
  path = system.file(
    "extdata", 
    "panorama.xlsx", 
    package = "tricky"
    )
  ) %>% 
  names()
##  [1] "Ministère porteur"                            
##  [2] "Ministère nom complet"                        
##  [3] "Nom du projet"                                
##  [4] "Projet interministériel"                      
##  [5] "Description et objectifs du projet"           
##  [6] "En savoir plus sur le projet"                 
##  [7] "Début"                                        
##  [8] "Durée prévisionnelle en année"                
##  [9] "Phase du projet en cours"                     
## [10] "Coût estimé"                                  
## [11] "Coût estimé par tranche"                      
## [12] "Zone fonctionnelle"                           
## [13] "Financement [programme(s)]"                   
## [14] "Lien vers les Projets Annuels de Performances"
## [15] "Date de publication"
  • set_standard_names takes a table and returns the same table with standardized names
read_excel(
  path = system.file(
    "extdata", 
    "panorama.xlsx", 
    package = "tricky"
    )
  ) %>% 
  set_standard_names() %>% 
  glimpse()
## Observations: 57
## Variables: 15
## $ ministere_porteur                             <chr> "SPM", "SPM", "S...
## $ ministere_nom_complet                         <chr> "Premier ministr...
## $ nom_du_projet                                 <chr> "FRANCE CONNECT"...
## $ projet_interministeriel                       <chr> "Oui", "Oui", "O...
## $ description_et_objectifs_du_projet            <chr> "FranceConnect o...
## $ en_savoir_plus_sur_le_projet                  <chr> "https://doc.int...
## $ debut                                         <dttm> 2013-07-01, 201...
## $ duree_previsionnelle_en_annee                 <chr> "3.7506849315068...
## $ phase_du_projet_en_cours                      <chr> "Déploiement", "...
## $ cout_estime                                   <chr> "7", "52", "26.3...
## $ cout_estime_par_tranche                       <chr> "entre 5 et 9 M€...
## $ zone_fonctionnelle                            <chr> "Gestion et cont...
## $ financement_programme_s_                      <chr> "Coordination du...
## $ lien_vers_les_projets_annuels_de_performances <chr> "http://www.perf...
## $ date_de_publication                           <dttm> 2016-11-03, 201...

Detect missing values

  • count_na() returns a table of missing and non-missing values in a vector
  • detect_na() returns a table with the number and the share of missing values for each variable
read_excel(
  path = system.file(
    "extdata", 
    "panorama.xlsx", 
    package = "tricky"
    )
  ) %>% 
  set_standard_names() %>% 
  .$ministere_porteur %>% 
  count_na()
## # A tibble: 1 x 2
##   f         n
##   <fct> <int>
## 1 FALSE    57
read_excel(
  path = system.file(
    "extdata", 
    "panorama.xlsx", 
    package = "tricky"
    )
  ) %>% 
  set_standard_names() %>% 
  detect_na()
## # A tibble: 4 x 3
## # Groups:   variable [4]
##   variable                                      n_missing share_missing
##   <fct>                                             <int>         <dbl>
## 1 en_savoir_plus_sur_le_projet                         38         66.7 
## 2 debut                                                 1          1.75
## 3 financement_programme_s_                              1          1.75
## 4 lien_vers_les_projets_annuels_de_performances         6         10.5