Notes

Data harmonization and linguistic mapping

At least three data types:

Typology

library(lingtypology)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.0 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.3.4          ✔ dplyr   0.7.4     
## ✔ tidyr   0.7.2          ✔ stringr 1.2.0     
## ✔ readr   1.1.1          ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(leaflet)
library(leaflet.minicharts)
## Warning: package 'leaflet.minicharts' was built under R version 3.4.3
library(sf)
## Warning: package 'sf' was built under R version 3.4.3
## Linking to GEOS 3.6.1, GDAL 2.1.3, proj.4 4.9.3
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
# Languages that lingtypology assigns to the "Uralic" affiliation.
uralic <- lang.aff("Uralic")

# WALS feature 85A, restricted to three languages.
wals_85A <- wals.feature("85A")
wals_85A_scandinavia <- filter(
  wals_85A,
  language %in% c("Finnish", "Russian", "Swedish")
)

# One arrow-shaped marker per language, coloured and labelled by feature value.
map.feature(
  languages = wals_85A_scandinavia[["language"]],
  features = wals_85A_scandinavia[["85A"]],
  label = wals_85A_scandinavia[["language"]],
  shape = c("➡", "⬅")
)

Extends to linguistic area maps

# Village-level map built from the `circassian` data set: points are placed at
# the explicit coordinates, coloured by dialect and labelled with village names.
map.feature(
  languages = circassian[["language"]],
  features = circassian[["dialect"]],
  label = circassian[["village"]],
  latitude = circassian[["latitude"]],
  longitude = circassian[["longitude"]]
)

# Village table (coordinates, population, language, dialect) fetched from the
# langdoc/kpv-geography repository.
kpv <- read_csv(
  file = "https://raw.githubusercontent.com/langdoc/kpv-geography/master/kpv.csv"
)
## Parsed with column specification:
## cols(
##   village = col_character(),
##   population_2010 = col_integer(),
##   latitude = col_double(),
##   longitude = col_double(),
##   type = col_character(),
##   adm_center = col_character(),
##   district = col_character(),
##   region = col_character(),
##   language = col_character(),
##   dialect = col_character()
## )
# Same kind of village-level dialect map as above, now for the `kpv` table.
map.feature(
  languages = kpv[["language"]],
  features = kpv[["dialect"]],
  label = kpv[["village"]],
  latitude = kpv[["latitude"]],
  longitude = kpv[["longitude"]]
)

Comments

Dialect atlas

Map source: http://kettunen.fnhost.org/html/kett117.html

#' Append point coordinates of an sf object as ordinary columns
#'
#' @param x An `sf` object whose geometry column is of class `sfc_POINT`.
#' @param names Column names for the coordinates; the length must equal the
#'   number of coordinate columns returned by `sf::st_coordinates()`.
#' @return `x` with any existing columns called `names` dropped and the
#'   coordinate columns bound on at the end.
sfc_as_cols <- function(x, names = c("longitude", "latitude")) {
  # Only sf objects with point geometries are accepted.
  stopifnot(inherits(x, "sf") && inherits(sf::st_geometry(x), "sfc_POINT"))

  ret <- tibble::as_tibble(sf::st_coordinates(x))
  stopifnot(length(names) == ncol(ret))

  # Drop columns that would clash with the new coordinate columns.
  keep <- !(names(x) %in% names)
  dplyr::bind_cols(x[, keep], setNames(ret, names))
}

# Read the shapefile, reproject it to WGS84 longitude/latitude and copy the
# point coordinates into plain `longitude` / `latitude` columns.
kettunen <- st_read("data/kettunen.shp") %>%
  st_transform("+proj=longlat +datum=WGS84") %>%
  sfc_as_cols()
## Reading layer `kettunen' from data source `/Users/niko/github/paris20180122/data/kettunen.shp' using driver `ESRI Shapefile'
## Simple feature collection with 108500 features and 12 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -272962.2 ymin: 6509287 xmax: 757501.3 ymax: 7795978
## epsg (SRID):    NA
## proj4string:    +proj=utm +zone=35 +ellps=GRS80 +units=m +no_defs
#' Draw one dialect-atlas map as an interactive leaflet map
#'
#' @param data Data frame with at least the columns `map_id`, `feature_value`
#'   and `feature_description`, in a form `leaflet()` can take point locations
#'   from.
#' @param map Value of `map_id` selecting the rows to draw.
#' @return A leaflet map with one circle marker per selected row, coloured by
#'   `feature_value`, and a legend titled with the first
#'   `feature_description` of the selection.
map_finnic <- function(data, map = "Kartta 151") {
  # Nine fixed hues first, then R's named colours in random order, leaving out
  # names that match the pattern below (pale/grey tones and names with a "1").
  my_colors <- c(
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#17becf",
    sample(grDevices::colors()[!grepl(
      "ivory|azure|white|gray|grey|black|pink|1",
      grDevices::colors()
    )])
  )

  current_selection <- data %>% filter(map_id == map)
  if (nrow(current_selection) == 0) {
    # Previously an unknown map id produced an empty map without any hint.
    warning("No rows with map_id '", map, "'; the map will be empty.",
            call. = FALSE)
  }

  # One colour per distinct feature value. seq_len() replaces 1:length(), and
  # max(., 1L) keeps a one-colour palette for an empty selection, which is what
  # the old c(1, 0) index produced by accident.
  n_values <- length(unique(current_selection$feature_value))
  pal <- colorFactor(
    my_colors[seq_len(max(n_values, 1L))],
    domain = current_selection$feature_value
  )

  title_text <- as.character(current_selection$feature_description[1])

  leaflet(data = current_selection) %>%
    addTiles() %>%
    addCircleMarkers(
      color = ~pal(feature_value),
      radius = 4,
      stroke = FALSE,
      fillOpacity = 0.5,
      popup = ~feature_value
    ) %>%
    addLegend(
      "bottomleft",
      pal = pal,
      values = ~feature_value,
      title = title_text,
      opacity = 1
    )
}

# Keep the original column names for later display.
kettunen_names <- names(kettunen)

# Rename the columns to generic feature names and take the map identifier
# from the text before the first colon in `alaryhma_n`.
kettunen <- kettunen %>%
  mutate(ilmio = as.character(ilmio)) %>%
  rename(
    feature_id = ilmio_id,
    feature_value = ilmio,
    feature_description = kuvaus,
    location = paikka_nim
  ) %>%
  mutate(map_id = str_extract(alaryhma_n, "^[^:]+(?=:)"))

map_finnic(kettunen, map = "Kartta 117")

Data for these maps

Features used in my variants of Finnic dialect maps:

# Print the stored column names directly: `kettunen_names` is an unnamed
# character vector, so wrapping it in names() only returned NULL.
kettunen_names
##  [1] "ilmio_id"   "aineisto_i" "aineisto_n" "alaryhma_i" "alaryhma_n"
##  [6] "ylaryhma_i" "ilmio"      "tyyppi"     "kuvaus"     "paikka_nim"
## [11] "paikantyyp" "mml_paikka" "geometry"   "longitude"  "latitude"

Using dialect corpus

# Token-level corpus table joined with the per-location table; the join key
# is left implicit, so dplyr joins on the shared column(s).
skn <- left_join(
  read_rds("data/skn_df.rds"),
  read_csv("data/skn_paikat.csv")
)
## Parsed with column specification:
## cols(
##   paikka = col_character(),
##   maa = col_character(),
##   lat = col_double(),
##   lon = col_double()
## )
## Joining, by = "paikka"
# Keep the column names for later display.
skn_names <- names(skn)

# One circle marker per distinct recording location.
skn %>%
  distinct(paikka, lat, lon) %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers()
## Assuming 'lon' and 'lat' are longitude and latitude, respectively

Structure here:

Note: some of the annotations were created automatically. Their quality is good, but this is crucial to keep in mind.

# Column names of the joined corpus table.
names(skn)
##  [1] "sane"          "alkup"         "pos"           "norm"         
##  [5] "msd"           "lemma"         "dephead"       "deprel"       
##  [9] "ref"           "paikka"        "nauhoitusaika" "puhuja"       
## [13] "sukupuoli"     "murre"         "murrealue"     "rooli"        
## [17] "url"           "wav"           "start"         "length"       
## [21] "id"            "position"      "maa"           "lat"          
## [25] "lon"
# First ten rows in `position` order, rendered as a table.
skn %>%
  arrange(position) %>%
  slice(1:10) %>%
  knitr::kable()
sane alkup pos norm msd lemma dephead deprel ref paikka nauhoitusaika puhuja sukupuoli murre murrealue rooli url wav start length id position maa lat lon
se se Pron se SUBCAT_Dem|NUM_Sg|CASE_Nom se 3 nsubj 2 Suomussalmi 1978 AR NA Kainuu Savolaismurteet muu https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=200&duration=1593&tiername=AR-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:0.20 1.59 s 1 1 Suomussalmi, Finland 64.88411 28.91166
ponttuu ponttuu V ponttuu PRS_Sg3|VOICE_Act|TENSE_Prs|MOOD_Ind|OTHER_UNK ponttuu 4 dobj 3 Suomussalmi 1978 AR NA Kainuu Savolaismurteet muu https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=200&duration=1593&tiername=AR-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:0.20 1.59 s 1 2 Suomussalmi, Finland 64.88411 28.91166
tehtiin tehtiin V tehtiin PRS_Pe4|VOICE_Pass|TENSE_Prt|MOOD_Ind tehdä 0 ROOT 4 Suomussalmi 1978 AR NA Kainuu Savolaismurteet muu https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=200&duration=1593&tiername=AR-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:0.20 1.59 s 1 3 Suomussalmi, Finland 64.88411 28.91166
? ? Punct ? _ ? 4 punct 5 Suomussalmi 1978 AR NA Kainuu Savolaismurteet muu https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=200&duration=1593&tiername=AR-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:0.20 1.59 s 1 4 Suomussalmi, Finland 64.88411 28.91166
no ’noo Adv noo _ no 2 advmod 1 Suomussalmi 1978 RJ M Kainuu Savolaismurteet haastateltava https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=1957&duration=4426&tiername=RJ-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:1.96 4.43 s 2 5 Suomussalmi, Finland 64.88411 28.91166
pannaan “pannaan V pannaan PRS_Pe4|VOICE_Pass|TENSE_Prs|MOOD_Ind panna 0 ROOT 2 Suomussalmi 1978 RJ M Kainuu Savolaismurteet haastateltava https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=1957&duration=4426&tiername=RJ-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:1.96 4.43 s 2 6 Suomussalmi, Finland 64.88411 28.91166
, , Punct , _ , 7 punct 3 Suomussalmi 1978 RJ M Kainuu Savolaismurteet haastateltava https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=1957&duration=4426&tiername=RJ-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:1.96 4.43 s 2 7 Suomussalmi, Finland 64.88411 28.91166
ne ne Pron ne SUBCAT_Dem|NUM_Pl|CASE_Nom se 7 nsubj-cop 4 Suomussalmi 1978 RJ M Kainuu Savolaismurteet haastateltava https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=1957&duration=4426&tiername=RJ-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:1.96 4.43 s 2 8 Suomussalmi, Finland 64.88411 28.91166
oli oli V oli PRS_Sg3|VOICE_Act|TENSE_Prt|MOOD_Ind olla 7 cop 5 Suomussalmi 1978 RJ M Kainuu Savolaismurteet haastateltava https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=1957&duration=4426&tiername=RJ-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:1.96 4.43 s 2 9 Suomussalmi, Finland 64.88411 28.91166
, , Punct , _ , 7 punct 6 Suomussalmi 1978 RJ M Kainuu Savolaismurteet haastateltava https://lat.csc.fi/ds/annex/runLoader?nodeid=MPI7571%23&time=1957&duration=4426&tiername=RJ-original https://lat.csc.fi/ds/imdi_browser/viewcontroller?nodeid=MPI7512%23&action=Download 00:00:1.96 4.43 s 2 10 Suomussalmi, Finland 64.88411 28.91166
# Per location, the share of "pre" vs. "post" uses of the lemma "kanssa"
# among interviewee ("haastateltava") tokens tagged as adpositions.
# NOTE(review): the `dephead > ref` comparison assumes both columns are
# numeric; character columns would compare lexicographically. TODO confirm.
skn_kanssa <- skn %>%
  mutate(id = as.numeric(id)) %>%
  arrange(id, position) %>%
  filter(rooli == "haastateltava") %>%
  # Disabled helper for inspecting the neighbouring tokens:
  #  mutate(context = glue("{lag(sane)} {sane} {lead(sane)}")) %>%
  filter(
    pos == "Adp",
    deprel == "adpos",
    lemma == "kanssa"
  ) %>%
  # "pre" when the head index exceeds the token's own index, otherwise "post".
  mutate(type = ifelse(dephead > ref, "pre", "post")) %>%
  # Total hits per location, then the share of each type within it.
  add_count(paikka) %>%
  rename(count_adpos = n) %>%
  group_by(paikka, type) %>%
  mutate(freq_adpos = n() / count_adpos) %>%
  ungroup() %>%
  # One row per location with a `pre` and a `post` column; missing types get 0.
  distinct(paikka, lat, lon, freq_adpos, type) %>%
  spread(type, freq_adpos) %>%
  replace(is.na(.), 0)

# skn_kanssa_hits %>% slice(1) %>% pull(url) %>% browseURL()

You end up with something like this (in this case, for different scenarios with different structures):

# First ten locations of the summary, rendered as a table.
skn_kanssa %>%
  slice(1:10) %>%
  knitr::kable()
paikka lat lon post pre
Alastaro 60.95198 22.86193 0.8888889 0.1111111
Alatornio 65.82584 24.16745 0.9333333 0.0666667
Artjärvi 60.74365 26.05508 0.9230769 0.0769231
Askola 60.53044 25.59784 0.8800000 0.1200000
Eurajoki 61.20236 21.73398 0.9696970 0.0303030
Hailuoto 65.03333 24.70000 0.9677419 0.0322581
Heinola 61.20423 26.03810 0.9473684 0.0526316
Hietamäki 63.07211 22.51892 0.9565217 0.0434783
Hinnerjoki 60.99986 21.98383 0.9777778 0.0222222
Hollola 60.98870 25.51632 0.9444444 0.0555556
# Pie chart of the pre/post shares at every location, with the WALS 85A
# language markers added to the same map through `pipe.data`.
leaflet() %>%
  leaflet::addTiles() %>%
  addMinicharts(
    lng = skn_kanssa[["lon"]],
    lat = skn_kanssa[["lat"]],
    type = "pie",
    width = 20,
    chartdata = skn_kanssa[, c("pre", "post")]
  ) %>%
  map.feature(
    pipe.data = .,
    languages = wals_85A_scandinavia[["language"]],
    features = wals_85A_scandinavia[["85A"]],
    label = wals_85A_scandinavia[["language"]],
    shape = c("➡", "⬅")
  )

Fake news!

More realistic workflow:

Current situation

# Column names of the atlas data as stored before the renaming step.
kettunen_names
##  [1] "ilmio_id"   "aineisto_i" "aineisto_n" "alaryhma_i" "alaryhma_n"
##  [6] "ylaryhma_i" "ilmio"      "tyyppi"     "kuvaus"     "paikka_nim"
## [11] "paikantyyp" "mml_paikka" "geometry"   "longitude"  "latitude"
# Column names of the joined dialect corpus table.
skn_names
##  [1] "sane"          "alkup"         "pos"           "norm"         
##  [5] "msd"           "lemma"         "dephead"       "deprel"       
##  [9] "ref"           "paikka"        "nauhoitusaika" "puhuja"       
## [13] "sukupuoli"     "murre"         "murrealue"     "rooli"        
## [17] "url"           "wav"           "start"         "length"       
## [21] "id"            "position"      "maa"           "lat"          
## [25] "lon"