5 min read

Using mlbgameday to see what pitches get hit

Load packages

library(mlbgameday)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(magrittr)

Get data. Using arbitrary dates.

# Request the Gameday payload for August 2018 (the date range is arbitrary).
dat <- get_payload(
  start = "2018-08-01",
  end = "2018-08-31"
)
## Gathering Gameday data, please be patient...
## Warning: executing %dopar% sequentially: no parallel backend registered

This site is useful for deciphering fields.

dat$pitch has pitch data (location, speed, etc), dat$atbat has at bat data (result of at bat, etc), so we need to join these.

# Alternative that keeps every pitch of each at bat:
# d2 <- inner_join(dat$pitch, dat$atbat, by = c("num", "url"))
# Including play_guid in the join keys keeps only the final pitch of each
# at bat.
d2 <- dat$pitch %>%
  inner_join(dat$atbat, by = c("play_guid", "num", "url"))

event has the result of the atbat

# Tabulate the at-bat outcomes, most frequent first.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
d2$event %>% table %>% sort(decreasing = TRUE)
## .
##            Strikeout            Groundout               Single 
##                 6873                 5591                 4690 
##               Flyout                 Walk              Lineout 
##                 3496                 2330                 2001 
##              Pop Out               Double             Home Run 
##                 1596                 1383                  992 
##             Forceout     Grounded Into DP          Intent Walk 
##                  594                  583                  562 
##         Hit By Pitch          Field Error              Sac Fly 
##                  324                  254                  216 
##             Sac Bunt               Triple          Double Play 
##                  153                  136                   82 
##  Fielders Choice Out       Bunt Groundout           Runner Out 
##                   59                   53                   51 
##       Strikeout - DP         Bunt Pop Out      Fielders Choice 
##                   33                   23                   13 
##  Batter Interference Catcher Interference     Fan interference 
##                   10                    6                    3 
##         Bunt Lineout           Sac Fly DP          Triple Play 
##                    2                    1                    1

Let’s only keep the main events.

# Restrict to the nine most frequent at-bat outcomes.
d3 <- d2 %>%
  filter(
    event %in% c(
      "Strikeout", "Groundout", "Single",
      "Flyout", "Walk", "Lineout",
      "Pop Out", "Double", "Home Run"
    )
  )

There are 65 of these that don’t have px or pz. I’ll remove them.

# Summarise px, including how many values are missing.
summary(d3$px)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -3.27056 -0.44091  0.02298  0.01852  0.47218  3.03732       65
# Drop pitches without a recorded location. Both coordinates are checked
# because the location plots use px and pz together.
d3 <- d3 %>%
  filter(!is.na(px), !is.na(pz))

Let’s look at where these occur in the strike zone.

# Pitch locations, one panel per at-bat outcome.
ggplot(d3, aes(x = px, y = pz, colour = event)) +
  geom_point(alpha = 0.2) +
  facet_wrap(. ~ event)

Unsurprisingly, walks are from out of the zone and extra base hits are in the zone.

# Same scatter with a fixed aspect ratio and 2D density contours overlaid.
ggplot(d3, aes(x = px, y = pz, colour = event)) +
  geom_point(alpha = 0.2) +
  facet_wrap(. ~ event) +
  coord_fixed() +
  stat_density_2d()

The zone field shows which zone the pitch ended up in: a 3x3 grid for the strike zone, plus four ball zones. These may change across batters (there is overlap comparing 3v4).

# Pitch locations, one panel per zone, coloured by at-bat outcome.
ggplot(d3, aes(x = px, y = pz, colour = event)) +
  geom_point(alpha = 0.04) +
  facet_wrap(. ~ zone) +
  coord_fixed()

Here I’m making zones for the sake of plotting heat maps.

# Polygon vertices used to draw each zone on a 5 x 5 plotting grid.
# Zones 1-9 are unit squares (4 vertices each) tiling the 3 x 3 block from
# (1, 1) to (4, 4); zones 11-14 are L-shaped hexagons (6 vertices each) that
# wrap around the four corners of that block.
# NOTE(review): as coded, zone 13 is drawn bottom-right and zone 14
# bottom-left -- confirm this matches the zone numbering in the data.
zones <- data.frame(
  zone = c(rep(1:9, each = 4), rep(11:14, each = 6)),
  x = c(
    1, 2, 2, 1,   2, 3, 3, 2,   3, 4, 4, 3,   # zones 1-3 (top row)
    1, 2, 2, 1,   2, 3, 3, 2,   3, 4, 4, 3,   # zones 4-6 (middle row)
    1, 2, 2, 1,   2, 3, 3, 2,   3, 4, 4, 3,   # zones 7-9 (bottom row)
    0, 1, 1, 2.5, 2.5, 0,                     # zone 11 (top left)
    2.5, 4, 4, 5, 5, 2.5,                     # zone 12 (top right)
    2.5, 5, 5, 4, 4, 2.5,                     # zone 13 (bottom right)
    0, 2.5, 2.5, 1, 1, 0                      # zone 14 (bottom left)
  ),
  y = c(
    3, 3, 4, 4,   3, 3, 4, 4,   3, 3, 4, 4,   # zones 1-3 (top row)
    2, 2, 3, 3,   2, 2, 3, 3,   2, 2, 3, 3,   # zones 4-6 (middle row)
    1, 1, 2, 2,   1, 1, 2, 2,   1, 1, 2, 2,   # zones 7-9 (bottom row)
    2.5, 2.5, 4, 4, 5, 5,                     # zone 11 (top left)
    4, 4, 2.5, 2.5, 5, 5,                     # zone 12 (top right)
    0, 0, 2.5, 2.5, 1, 1,                     # zone 13 (bottom right)
    0, 0, 1, 1, 2.5, 2.5                      # zone 14 (bottom left)
  )
)
# Label zones 1-9 as strike ("S") and 11-14 as ball ("B"). Assigned after
# construction so the column is character regardless of the data.frame()
# stringsAsFactors default.
zones$bs <- ifelse(zones$zone <= 9, "S", "B")
# Draw the zone polygons, filled by the strike/ball label.
ggplot() +
  geom_polygon(
    data = zones,
    mapping = aes(x = x, y = y, fill = as.factor(bs), group = zone),
    alpha = 0.5
  )

Okay, so now I want to show, for a given zone, the probability of the at bat ending in a home run (or any other event).

First we’ll just group the events by their zone.

# Count at bats for every (event, zone) combination.
zone_events <- d3 %>%
  group_by(event, zone) %>%
  summarize(n = n())
head(zone_events)
## # A tibble: 6 x 3
## # Groups:   event [1]
##   event   zone     n
##   <chr>  <dbl> <int>
## 1 Double     1    63
## 2 Double     2    60
## 3 Double     3    58
## 4 Double     4   160
## 5 Double     5   182
## 6 Double     6   128
# Total at bats per zone; used as the denominator for the event proportions.
zone_events_total <- d3 %>%
  group_by(zone) %>%
  summarize(ntotal = n())
zone_events_total
## # A tibble: 13 x 2
##     zone ntotal
##    <dbl>  <int>
##  1     1   1139
##  2     2   1391
##  3     3   1158
##  4     4   2113
##  5     5   2476
##  6     6   2029
##  7     7   1680
##  8     8   2189
##  9     9   1901
## 10    11   2464
## 11    12   2008
## 12    13   3693
## 13    14   4646

Now we join the zone_events with the total

# Attach the per-zone totals and compute p, the share of a zone's at bats
# that ended in each event.
zone_events2 <- zone_events %>%
  dplyr::full_join(zone_events_total, by = "zone") %>%
  mutate(p = n / ntotal)
head(zone_events2)
## # A tibble: 6 x 5
## # Groups:   event [1]
##   event   zone     n ntotal      p
##   <chr>  <dbl> <int>  <int>  <dbl>
## 1 Double     1    63   1139 0.0553
## 2 Double     2    60   1391 0.0431
## 3 Double     3    58   1158 0.0501
## 4 Double     4   160   2113 0.0757
## 5 Double     5   182   2476 0.0735
## 6 Double     6   128   2029 0.0631

Now I’ll join with the zone locations.

# Attach the polygon vertices: each (event, zone) row is repeated once per
# vertex of its zone.
zone_events3 <- zone_events2 %>%
  full_join(zones, by = "zone")
head(zone_events3)
## # A tibble: 6 x 8
## # Groups:   event [1]
##   event   zone     n ntotal      p     x     y bs   
##   <chr>  <dbl> <int>  <int>  <dbl> <dbl> <dbl> <chr>
## 1 Double     1    63   1139 0.0553     1     3 S    
## 2 Double     1    63   1139 0.0553     2     3 S    
## 3 Double     1    63   1139 0.0553     2     4 S    
## 4 Double     1    63   1139 0.0553     1     4 S    
## 5 Double     2    60   1391 0.0431     2     3 S    
## 6 Double     2    60   1391 0.0431     3     3 S

Now let’s try to plot it.

# Heat map of p by zone, one panel per event, on a blue-white-red scale.
ggplot(zone_events3) +
  geom_polygon(aes(x = x, y = y, fill = p, group = zone), alpha = 0.8) +
  scale_fill_gradientn(colours = c("blue", "white", "red")) +
  facet_wrap(. ~ event)

This plot looks reasonable. It says pitches in the bottom left lead to strikeouts. Pitches in the strike zone don’t lead to walks. Groundouts result more often from pitches down than up.

It doesn’t look that good or useful. But it’s a start.