data.world is an online catalogue for data and analysis.
Create a data.world account, find your API token under the advanced settings, and save it as an environment variable:
# Store the API token in the DW_API_TOKEN environment variable for this R
# session; replace "..." with your own token and keep it out of shared code.
Sys.setenv(DW_API_TOKEN = "...")
Install the data.world package, then load it and configure it to read the token from that variable:
library(data.world)
# Build the configuration from the environment, naming DW_API_TOKEN as the
# variable that holds the auth token, and register it with set_config().
set_config(cfg_env(auth_token_var = "DW_API_TOKEN"))
If you’re not familiar with the tidyverse, it’s never too late to start. I’m a big fan of pipes.
library(tidyverse)
This code downloads all files (about 35 MB at the time of writing) from my sumo dataset to the working directory:
# Identify the dataset once so the metadata call and every download agree.
owner <- "cervus"
dataset <- "sumo-japan"

# get meta data
get_dataset(owner_id = owner, dataset_id = dataset) %>%
  # extract file names
  .$files %>%
  map("name") %>%
  # download each file, saving it under its own name; walk() is used because
  # only the side effect (the saved file) matters, so no result list is
  # built and printed
  walk(
    function(fn) {
      download_file(
        owner_id = owner,
        dataset_id = dataset,
        file_name = fn,
        output = fn
      )
    }
  )
You’ve got three CSV files:
# List files in the working directory whose names end in ".csv"; the `$`
# anchor keeps names such as "data.csv.bak" out of the result.
list.files(pattern = "\\.csv$")
## [1] "banzuke.csv" "odds.csv" "results.csv"
Banzuke – rankings published before each tournament – taken from Sumo Reference:
# Parse the banzuke file and print a compact summary of its structure.
read_csv("banzuke.csv") %>%
  str()
## Parsed with column specification:
## cols(
## basho = col_double(),
## id = col_double(),
## rank = col_character(),
## rikishi = col_character(),
## heya = col_character(),
## shusshin = col_character(),
## birth_date = col_date(format = ""),
## height = col_double(),
## weight = col_double(),
## prev = col_character(),
## prev_w = col_double(),
## prev_l = col_double()
## )
## tibble [167,115 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ basho : num [1:167115] 1983 1983 1983 1983 1983 ...
## $ id : num [1:167115] 1354 4080 4095 4104 4112 ...
## $ rank : chr [1:167115] "Y1e" "Y1w" "Y2eHD" "O1e" ...
## $ rikishi : chr [1:167115] "Chiyonofuji" "Kitanoumi" "Wakanohana" "Takanosato" ...
## $ heya : chr [1:167115] "Kokonoe" "Mihogaseki" "Futagoyama" "Futagoyama" ...
## $ shusshin : chr [1:167115] "Hokkaido" "Hokkaido" "Aomori" "Aomori" ...
## $ birth_date: Date[1:167115], format: "1955-06-01" "1953-05-16" ...
## $ height : num [1:167115] 182 179 186 181 183 ...
## $ weight : num [1:167115] 116 165 133 144 163 121 138 181 124 156 ...
## $ prev : chr [1:167115] "Y1e" "Y2eHD" "Y1w" "O1e" ...
## $ prev_w : num [1:167115] 14 9 0 10 10 12 8 9 9 11 ...
## $ prev_l : num [1:167115] 1 3 0 5 5 3 7 6 6 4 ...
## - attr(*, "spec")=
## .. cols(
## .. basho = col_double(),
## .. id = col_double(),
## .. rank = col_character(),
## .. rikishi = col_character(),
## .. heya = col_character(),
## .. shusshin = col_character(),
## .. birth_date = col_date(format = ""),
## .. height = col_double(),
## .. weight = col_double(),
## .. prev = col_character(),
## .. prev_w = col_double(),
## .. prev_l = col_double()
## .. )
Results (top two divisions, at the moment) – also from Sumo Reference:
# Parse the results file and print a compact summary of its structure.
read_csv("results.csv") %>%
  str()
## Parsed with column specification:
## cols(
## basho = col_double(),
## day = col_double(),
## rikishi1_id = col_double(),
## rikishi1_rank = col_character(),
## rikishi1_shikona = col_character(),
## rikishi1_result = col_character(),
## rikishi1_win = col_double(),
## kimarite = col_character(),
## rikishi2_id = col_double(),
## rikishi2_rank = col_character(),
## rikishi2_shikona = col_character(),
## rikishi2_result = col_character(),
## rikishi2_win = col_double()
## )
## tibble [218,292 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ basho : num [1:218292] 1983 1983 1983 1983 1983 ...
## $ day : num [1:218292] 1 1 1 1 1 1 1 1 1 1 ...
## $ rikishi1_id : num [1:218292] 4140 4306 1337 4323 4097 ...
## $ rikishi1_rank : chr [1:218292] "J13w" "Ms1e" "J12w" "J13e" ...
## $ rikishi1_shikona: chr [1:218292] "Chikubayama" "Ofuji" "Tochitsukasa" "Shiraiwa" ...
## $ rikishi1_result : chr [1:218292] "0-1 (7-8)" "1-0 (6-1)" "1-0 (9-6)" "0-1 (3-12)" ...
## $ rikishi1_win : num [1:218292] 0 1 1 0 0 1 0 1 0 1 ...
## $ kimarite : chr [1:218292] "yorikiri" "yorikiri" "oshidashi" "oshidashi" ...
## $ rikishi2_id : num [1:218292] 4306 4140 4323 1337 4319 ...
## $ rikishi2_rank : chr [1:218292] "Ms1e" "J13w" "J13e" "J12w" ...
## $ rikishi2_shikona: chr [1:218292] "Ofuji" "Chikubayama" "Shiraiwa" "Tochitsukasa" ...
## $ rikishi2_result : chr [1:218292] "1-0 (6-1)" "0-1 (7-8)" "0-1 (3-12)" "1-0 (9-6)" ...
## $ rikishi2_win : num [1:218292] 1 0 0 1 1 0 1 0 1 0 ...
## - attr(*, "spec")=
## .. cols(
## .. basho = col_double(),
## .. day = col_double(),
## .. rikishi1_id = col_double(),
## .. rikishi1_rank = col_character(),
## .. rikishi1_shikona = col_character(),
## .. rikishi1_result = col_character(),
## .. rikishi1_win = col_double(),
## .. kimarite = col_character(),
## .. rikishi2_id = col_double(),
## .. rikishi2_rank = col_character(),
## .. rikishi2_shikona = col_character(),
## .. rikishi2_result = col_character(),
## .. rikishi2_win = col_double()
## .. )
Betting odds I’ve been scraping from marathonbet.com since the May 2017 tournament:
# Parse the odds file and print a compact summary of its structure.
read_csv("odds.csv") %>%
  str()
## Parsed with column specification:
## cols(
## rikishi1 = col_character(),
## odds1 = col_double(),
## rikishi2 = col_character(),
## odds2 = col_double(),
## ts = col_datetime(format = "")
## )
## tibble [19,052 x 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ rikishi1: chr [1:19052] "Arawashi" "Daishomaru" "Goeido" "Hakuho" ...
## $ odds1 : num [1:19052] 1.8 1.9 1.3 1.1 1.4 1.7 1.6 1.53 1.35 1.26 ...
## $ rikishi2: chr [1:19052] "Ura" "Onosho" "Okinoumi" "Chiyonokuni" ...
## $ odds2 : num [1:19052] 2.01 1.9 3.52 7 2.96 2.15 2.34 2.51 3.2 3.86 ...
## $ ts : POSIXct[1:19052], format: "2017-05-13 09:00:01" "2017-05-13 09:00:01" ...
## - attr(*, "spec")=
## .. cols(
## .. rikishi1 = col_character(),
## .. odds1 = col_double(),
## .. rikishi2 = col_character(),
## .. odds2 = col_double(),
## .. ts = col_datetime(format = "")
## .. )
Examples of what can be done with these data will follow.