library(tidyverse)
library(tidymodels)
# future::plan("multisession", workers = 5)
Three Strikes but Make Them Data Science Languages (R, Python, Julia)
Introduction
I’m Kevin Baer, a second-year undergrad at UCLA, and this is the code notebook to go along with my submission for Nick Wan’s “Predict strikeouts with new MLB arm angle data” competition. It’s finals week here at UCLA, so I didn’t have time to really dig into this project. What I did decide to do, though, was build models in R, Python, and Julia! At its core, each is a simple model using just the ~150k training rows with two strikes. I believe this choice makes the process much more straightforward, since it leaves the training data more comparable to the test data.
Note:
I’d rate my coding ability as follows: R <- 8/10, Python <- 4/10, Julia <- 4/10. Feel free to send any suggestions or feedback to kevinbaer@ucla.edu; I’m sure there are many improvements to be made in the latter two languages, and I’d love to learn. Also, I have not run the code here, because the training can take an hour or two and I’m not that patient.
Packages
We’ll handle data manipulation with the Tidyverse, Polars, and Tidier, and build XGBoost models with Tidymodels, XGBoost/scikit-learn, and MLJ.
import polars as pl
import xgboost as xgb
import sklearn.model_selection as skms
import numpy as np
import polars.selectors as cs
import optuna
from sklearn.model_selection import train_test_split
import sklearn.metrics
using Tidier
using MLJ
Data Management
To wrangle our data, I generally changed non-numeric columns into factor or categorical types, depending on the language. I’m sure that there are some smoother ways to do this in Python and Julia.
train_strikeout <- duckplyr::read_csv_duckdb("data/train.csv") |>
  as_tibble() |>
  filter(strikes == 2) |>
  mutate(k = as.factor(as.logical(k))) |>
  mutate(across(where(is.character) | where(is.logical), as.factor)) |>
  mutate(response = k) |>
  select(-c(is_strike, k))
df = (
    pl.read_csv("data/train.csv", null_values = "NA")
    .filter(pl.col("strikes") == 2)
    .with_columns(
        pl.col("k").cast(pl.Boolean),
        pl.col("stand").cast(pl.Categorical),
        pl.col("p_throws").cast(pl.Categorical),
        pl.col("pitch_type").cast(pl.Categorical),
        pl.col("inning_topbot").cast(pl.Categorical),
        pl.col("pitch_name").cast(pl.Categorical)
    )
    .rename({"k": "response"})
    .drop("is_strike")
)
df = @chain begin
    read_csv("data/train.csv", missingstring = ["NA", ""])
    @filter(strikes == 2)
    @mutate(k = as_categorical(k),
            stand = as_categorical(stand),
            p_throws = as_categorical(p_throws),
            pitch_type = as_categorical(pitch_type),
            on_2b = as_categorical(on_2b),
            on_1b = as_categorical(on_1b),
            inning_topbot = as_categorical(inning_topbot),
            pitch_name = as_categorical(pitch_name)
    )
    @mutate(response = k)
    @select(-[is_strike, k])
end;
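If you want to double-check the wrangling, Polars makes it easy to peek at the result (just a quick sanity check on my part, not part of the submission code):

# Confirm the casts and the k -> response rename took effect.
print(df.schema)                                  # column name -> dtype
print(df.select(cs.categorical()).columns)        # columns now stored as Categorical
print(df.get_column("response").value_counts())   # strikeout class balance at two strikes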
Set-up for ML
I did a few different forms of one-hot encoding; it’s cool that Polars has this built in (there’s a small illustration after the code blocks below).
set.seed(11042004)
pitcher_split <- initial_split(train_strikeout, strata = response)
pitcher_train <- training(pitcher_split)
pitcher_test <- testing(pitcher_split)

pitcher_folds <- vfold_cv(pitcher_train, strata = response, v = 10)

rec_2 <-
  recipe(response ~ ., data = pitcher_train) |>
  update_role(index, new_role = "ID") |>
  step_dummy(all_nominal_predictors())
X = df.drop("response").to_dummies(cs.categorical())
y = df.get_column("response")
y, X = unpack(df, ==(:response), rng = 11042004);

hot = OneHotEncoder()
mach = fit!(machine(hot, X))
W = transform(mach, X)

train, test = partition(eachindex(y), 0.75, stratify = y, rng = 11042004);
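As a quick illustration of that built-in encoding (a toy example with made-up values, separate from the submission code), to_dummies expands each selected categorical column into 0/1 indicator columns while leaving numeric columns untouched:

import polars as pl
import polars.selectors as cs

# Toy frame: one categorical column plus one numeric column (values made up).
toy = pl.DataFrame({
    "stand": ["L", "R", "R"],
    "release_speed": [88.1, 95.3, 91.7],
}).with_columns(pl.col("stand").cast(pl.Categorical))

# Only the categorical column is expanded: stand -> stand_L, stand_R.
print(toy.to_dummies(cs.categorical()))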
ML Model
Here I set up the XGBoost model and created the various tuning components. There were some interesting naming differences across the three frameworks: the number of boosting rounds alone is trees (tidymodels), n_estimators (xgboost/scikit-learn), and num_round (MLJ), and tidymodels’ mtry corresponds to xgboost’s colsample_bynode. A rough cheat sheet of these correspondences follows the three code blocks below.
xgb_spec <-
  boost_tree(
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    min_n = tune(),
    sample_size = tune(),
    trees = tune(),
    mtry = tune()
  ) |>
  set_engine("xgboost") |>
  set_mode("classification")

set <-
  workflow_set(
    preproc = list(rec = rec_2),
    models = list(
      boosting = xgb_spec
    )
  )
def objective(trial):
    # Optuna objective: fit on a 75/25 split and return validation log loss (minimized).
    X = df.drop("response").to_dummies(cs.categorical())
    y = df.get_column("response")

    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.25)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "n_estimators": trial.suggest_int("n_estimators", 1, 2000),
        "min_child_weight": trial.suggest_float("min_child_weight", 2, 40),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "eta": trial.suggest_float("eta", 10**-10, 10**-1, log = True),
        "gamma": trial.suggest_float("gamma", 10**-10, 10**1.5, log = True),
        "subsample": trial.suggest_float("subsample", .1, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", .1, 1)
    }
    xgb_model = xgb.XGBClassifier(**param)

    bst = xgb_model.fit(train_x, train_y)
    preds = bst.predict_proba(valid_x)[:, 1]
    loss = sklearn.metrics.log_loss(valid_y, preds)
    return loss
XGBoostClassifier = @load(XGBoostClassifier, pkg=XGBoost, verbosity = 0)
xgb = XGBoostClassifier();

r1 = range(xgb, :num_round, lower = 1, upper = 2000);
r2 = range(xgb, :min_child_weight, lower = 2, upper = 40);
r3 = range(xgb, :max_depth, lower = 1, upper = 15);
r4 = range(xgb, :eta, lower = 10.0^-10, upper = 10.0^-1, scale = :log10)
r5 = range(xgb, :gamma, lower = 10.0^-10, upper = 10.0^1.5, scale = :log10)
r6 = range(xgb, :subsample, lower = 0.1, upper = 1)
r7 = range(xgb, :colsample_bynode, lower = 0.1, upper = 1)
Tuning
I then tuned the models over these hyperparameters, using Latin hypercube sampling in R and Julia. In Python, after some advice from Nick, I used the Optuna package to optimize the hyperparameters. All three searches minimize mean log loss on held-out data (a quick sketch of the metric follows the code blocks below).
grid_ctrl_2 <-
  control_grid(
    parallel_over = "everything"
  )

grid_results_2 <-
  set |>
  workflow_map(
    seed = 11042004,
    resamples = pitcher_folds,
    grid = 50,
    control = grid_ctrl_2,
    metrics = metric_set(mn_log_loss)
  )

best_2 <-
  grid_results_2 %>%
  extract_workflow_set_result("rec_boosting") %>%
  select_best(metric = "mn_log_loss")

boosting_test_results_2 <-
  grid_results_2 %>%
  extract_workflow("rec_boosting") %>%
  finalize_workflow(best_2) %>%
  last_fit(split = pitcher_split, metrics = metric_set(mn_log_loss))
study = optuna.create_study(direction="minimize")
study.optimize(objective, timeout=3600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

clf = xgb.XGBClassifier(**trial.params)
clf.fit(X, y)
latin = LatinHypercube(gens = 2, popsize = 100)

self_tuning = TunedModel(
    model = xgb,
    tuning = latin,
    resampling = StratifiedCV(nfolds = 10),
    range = [r1, r2, r3, r4, r5, r6, r7],
    measure = log_loss,
    n = 50
);

model = machine(self_tuning, W[train, :], y[train])
fit!(model);
Submission Creation
And now that we have our models, it’s just a matter of preparing the test set, predicting, and writing to a CSV for upload! Note that Python returned the class probabilities in the opposite column order from R and Julia (the 0- vs. 1-based indexing hides that here); there’s a small check of this right after the Python block below.
xgb_wf <-
  workflow() |>
  add_recipe(rec_2) |>
  add_model(xgb_spec)

final_mod <- finalize_workflow(
  xgb_wf,
  best_2
) |>
  fit(data = train_strikeout)

sub_data <- duckplyr::read_csv_duckdb("data/test.csv") |>
  as_tibble() |>
  mutate(across(where(is.character) | where(is.logical), as.factor))

sub_file <- duckplyr::read_csv_duckdb("data/sample_solution.csv") |>
  as_tibble()

sub_file$k <- predict(final_mod, sub_data, type = "prob")$.pred_TRUE

# write_csv(sub_file, "submissions/01_r_xgb_simple.csv")
sub_data = (
    pl.read_csv("data/test.csv", null_values = "NA")
    .filter(pl.col("strikes") == 2)
    .with_columns(
        pl.col("stand").cast(pl.Categorical),
        pl.col("p_throws").cast(pl.Categorical),
        pl.col("pitch_type").cast(pl.Categorical),
        pl.col("inning_topbot").cast(pl.Categorical),
        pl.col("pitch_name").cast(pl.Categorical)
    )
    .to_dummies(cs.categorical())
)

sub_file = pl.read_csv("data/sample_solution.csv")

sub_file = sub_file.with_columns(k = clf.predict_proba(sub_data)[:, 1])

# sub_file.write_csv("submissions/08_python_xgb_simple_optuna.csv")
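On the column-order note from above: scikit-learn orders the columns of predict_proba by clf.classes_, so a quick way to confirm which column is the strikeout (positive) class is to check that attribute directly (a small sanity check of my own, not in the submission code):

# predict_proba columns follow clf.classes_; with a 0/1 (or False/True) target,
# the positive class is the last column, which is why [:, 1] is used above.
print(clf.classes_)
k_prob = clf.predict_proba(sub_data)[:, -1]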
sub_data = @chain begin
    read_csv("data/test.csv", missingstring = ["NA", ""])
    @mutate(stand = as_categorical(stand))
    @mutate(p_throws = as_categorical(p_throws))
    @mutate(pitch_type = as_categorical(pitch_type))
    @mutate(on_3b = as_categorical(on_3b))
    @mutate(on_2b = as_categorical(on_2b))
    @mutate(on_1b = as_categorical(on_1b))
    @mutate(inning_topbot = as_categorical(inning_topbot))
    @mutate(pitch_name = as_categorical(pitch_name))
end;

sub_file = read_csv("data/sample_solution.csv")

sub_data_upd = transform(mach, sub_data)

fin_model = machine(self_tuning, W, y)
fit!(fin_model);

sub_file.k = pdf.(predict(fin_model, sub_data_upd), 1);

# write_csv(sub_file, "submissions/02_julia_xgb_simple.csv", col_names = true)
Conclusion
Thanks for reading; I hope you enjoyed it! Shoutout to Nick Wan for running this competition!
Reflections
Here were my private leaderboard scores from best to worst:
Julia model - 0.4268
R model - 0.4272 (submitted and finished 2nd, 0.0002 off leader)
Python model - 0.4273
These scores are all incredibly similar, which suggests the three models iterated to roughly the same place! I did zero feature engineering; the fact that entries that did were not clearly better indicates that the provided data already contains the vast majority of the necessary inputs (or that the other competitors didn’t find uniquely valuable ones).
Overall, I’m very happy with my second-place finish given that I wasn’t particularly trying to optimize my scores. It’s slightly disappointing not to have submitted the Julia entry, but that’s how it goes!
From other entries I’ve read, I feel I made the right choice to focus solely on two-strike pitches; the data is still large enough for XGBoost to be extremely effective. If I were to spend time on feature engineering, it would likely focus on richer pitch-movement metrics and on analyzing swing statistics to predict swings and misses.
Contact Me
You can contact me at kevinbaer@ucla.edu, and my LinkedIn is https://www.linkedin.com/in/kevinmbaer/.