5 min read

How to download sumo (or any other) data from data.world

data.world is an online catalogue for data and analysis.

Create a data.world account, find your API token in the advanced settings and save it as an environment variable:

# Store the data.world API token in the DW_API_TOKEN environment variable.
# "..." is a placeholder: substitute your own token. Sys.setenv() only affects
# the current R process, so this has to be run again in every new session.
Sys.setenv(DW_API_TOKEN = "...")

Install the data.world package, load it and tell it which environment variable holds the token. Now you’re all set:

library(data.world)
# Configure the data.world client from the environment: the auth token is
# looked up in the variable named by auth_token_var (here DW_API_TOKEN).
set_config(cfg_env(auth_token_var = "DW_API_TOKEN"))

If you’re not familiar with the tidyverse, it’s never too late. I’m a big fan of pipes.

library(tidyverse)

This code downloads all files (about 35 MBytes at the time of writing) from my sumo dataset to the working directory:

# look up the dataset's metadata on data.world
sumo_meta <- get_dataset(owner_id = "cervus", dataset_id = "sumo-japan")

# collect the name of every file listed in the metadata
sumo_files <- map(sumo_meta$files, "name")

# fetch the files one by one, saving each under its own name
# (relative path, i.e. into the working directory)
lapply(sumo_files, function(file_nm) {
    download_file(
        owner_id = "cervus",
        dataset_id = "sumo-japan",
        file_name = file_nm,
        output = file_nm
    )
})

You’ve got three CSV files:

# List the downloaded CSV files in the working directory.
# The pattern is a regular expression; the trailing `$` anchors it to the end
# of the file name, so only names that really end in ".csv" are returned
# (an unanchored "\\.csv" would also match e.g. "results.csv.bak").
list.files(pattern = "\\.csv$")
## [1] "banzuke.csv" "odds.csv"    "results.csv"

Banzuke – rankings published before each tournament – taken from Sumo Reference:

# parse the banzuke file and print a compact summary of its structure
str(read_csv("banzuke.csv"))
## Parsed with column specification:
## cols(
##   basho = col_double(),
##   id = col_double(),
##   rank = col_character(),
##   rikishi = col_character(),
##   heya = col_character(),
##   shusshin = col_character(),
##   birth_date = col_date(format = ""),
##   height = col_double(),
##   weight = col_double(),
##   prev = col_character(),
##   prev_w = col_double(),
##   prev_l = col_double()
## )
## tibble [167,115 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ basho     : num [1:167115] 1983 1983 1983 1983 1983 ...
##  $ id        : num [1:167115] 1354 4080 4095 4104 4112 ...
##  $ rank      : chr [1:167115] "Y1e" "Y1w" "Y2eHD" "O1e" ...
##  $ rikishi   : chr [1:167115] "Chiyonofuji" "Kitanoumi" "Wakanohana" "Takanosato" ...
##  $ heya      : chr [1:167115] "Kokonoe" "Mihogaseki" "Futagoyama" "Futagoyama" ...
##  $ shusshin  : chr [1:167115] "Hokkaido" "Hokkaido" "Aomori" "Aomori" ...
##  $ birth_date: Date[1:167115], format: "1955-06-01" "1953-05-16" ...
##  $ height    : num [1:167115] 182 179 186 181 183 ...
##  $ weight    : num [1:167115] 116 165 133 144 163 121 138 181 124 156 ...
##  $ prev      : chr [1:167115] "Y1e" "Y2eHD" "Y1w" "O1e" ...
##  $ prev_w    : num [1:167115] 14 9 0 10 10 12 8 9 9 11 ...
##  $ prev_l    : num [1:167115] 1 3 0 5 5 3 7 6 6 4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   basho = col_double(),
##   ..   id = col_double(),
##   ..   rank = col_character(),
##   ..   rikishi = col_character(),
##   ..   heya = col_character(),
##   ..   shusshin = col_character(),
##   ..   birth_date = col_date(format = ""),
##   ..   height = col_double(),
##   ..   weight = col_double(),
##   ..   prev = col_character(),
##   ..   prev_w = col_double(),
##   ..   prev_l = col_double()
##   .. )

Results (top two divisions, at the moment) – also from Sumo Reference:

# parse the results file and print a compact summary of its structure
str(read_csv("results.csv"))
## Parsed with column specification:
## cols(
##   basho = col_double(),
##   day = col_double(),
##   rikishi1_id = col_double(),
##   rikishi1_rank = col_character(),
##   rikishi1_shikona = col_character(),
##   rikishi1_result = col_character(),
##   rikishi1_win = col_double(),
##   kimarite = col_character(),
##   rikishi2_id = col_double(),
##   rikishi2_rank = col_character(),
##   rikishi2_shikona = col_character(),
##   rikishi2_result = col_character(),
##   rikishi2_win = col_double()
## )
## tibble [218,292 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ basho           : num [1:218292] 1983 1983 1983 1983 1983 ...
##  $ day             : num [1:218292] 1 1 1 1 1 1 1 1 1 1 ...
##  $ rikishi1_id     : num [1:218292] 4140 4306 1337 4323 4097 ...
##  $ rikishi1_rank   : chr [1:218292] "J13w" "Ms1e" "J12w" "J13e" ...
##  $ rikishi1_shikona: chr [1:218292] "Chikubayama" "Ofuji" "Tochitsukasa" "Shiraiwa" ...
##  $ rikishi1_result : chr [1:218292] "0-1 (7-8)" "1-0 (6-1)" "1-0 (9-6)" "0-1 (3-12)" ...
##  $ rikishi1_win    : num [1:218292] 0 1 1 0 0 1 0 1 0 1 ...
##  $ kimarite        : chr [1:218292] "yorikiri" "yorikiri" "oshidashi" "oshidashi" ...
##  $ rikishi2_id     : num [1:218292] 4306 4140 4323 1337 4319 ...
##  $ rikishi2_rank   : chr [1:218292] "Ms1e" "J13w" "J13e" "J12w" ...
##  $ rikishi2_shikona: chr [1:218292] "Ofuji" "Chikubayama" "Shiraiwa" "Tochitsukasa" ...
##  $ rikishi2_result : chr [1:218292] "1-0 (6-1)" "0-1 (7-8)" "0-1 (3-12)" "1-0 (9-6)" ...
##  $ rikishi2_win    : num [1:218292] 1 0 0 1 1 0 1 0 1 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   basho = col_double(),
##   ..   day = col_double(),
##   ..   rikishi1_id = col_double(),
##   ..   rikishi1_rank = col_character(),
##   ..   rikishi1_shikona = col_character(),
##   ..   rikishi1_result = col_character(),
##   ..   rikishi1_win = col_double(),
##   ..   kimarite = col_character(),
##   ..   rikishi2_id = col_double(),
##   ..   rikishi2_rank = col_character(),
##   ..   rikishi2_shikona = col_character(),
##   ..   rikishi2_result = col_character(),
##   ..   rikishi2_win = col_double()
##   .. )

Betting odds I’ve been scraping off marathonbet.com since the May 2017 tournament:

# parse the odds file and print a compact summary of its structure
str(read_csv("odds.csv"))
## Parsed with column specification:
## cols(
##   rikishi1 = col_character(),
##   odds1 = col_double(),
##   rikishi2 = col_character(),
##   odds2 = col_double(),
##   ts = col_datetime(format = "")
## )
## tibble [19,052 x 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ rikishi1: chr [1:19052] "Arawashi" "Daishomaru" "Goeido" "Hakuho" ...
##  $ odds1   : num [1:19052] 1.8 1.9 1.3 1.1 1.4 1.7 1.6 1.53 1.35 1.26 ...
##  $ rikishi2: chr [1:19052] "Ura" "Onosho" "Okinoumi" "Chiyonokuni" ...
##  $ odds2   : num [1:19052] 2.01 1.9 3.52 7 2.96 2.15 2.34 2.51 3.2 3.86 ...
##  $ ts      : POSIXct[1:19052], format: "2017-05-13 09:00:01" "2017-05-13 09:00:01" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   rikishi1 = col_character(),
##   ..   odds1 = col_double(),
##   ..   rikishi2 = col_character(),
##   ..   odds2 = col_double(),
##   ..   ts = col_datetime(format = "")
##   .. )

Examples of what can be done with these data will follow.