diff --git a/25_07_phd_defense/assets/01_common.R b/25_07_phd_defense/assets/01_common.R index b14e29f..1f13e12 100644 --- a/25_07_phd_defense/assets/01_common.R +++ b/25_07_phd_defense/assets/01_common.R @@ -1,4 +1,5 @@ text_size <- 16 +linesize <- 1 width <- 12 height <- 6 @@ -29,3 +30,26 @@ lamgrid <- c(-Inf, 2^(-15:25)) # Gamma grid gammagrid <- sort(1 - sqrt(seq(0, 0.99, .05))) + +material_pals <- c( + "red", "pink", "purple", "deep-purple", "indigo", + "blue", "light-blue", "cyan", "teal", "green", "light-green", "lime", + "yellow", "amber", "orange", "deep-orange", "brown", "grey", "blue-grey" +) +cols <- purrr::map(material_pals, ~ ggsci::pal_material(.x)(10)) %>% + purrr::reduce(cbind) +colnames(cols) <- material_pals + +cols %>% + as_tibble() %>% + mutate(idx = as.factor(1:10)) %>% + pivot_longer(-idx, names_to = "var", values_to = "val") %>% + mutate(var = factor(var, levels = material_pals[19:1])) %>% + ggplot() + + xlab(NULL) + + ylab(NULL) + + geom_tile(aes(x = idx, y = var, fill = val)) + + scale_fill_identity() + + scale_x_discrete(expand = c(0, 0)) + + scale_y_discrete(expand = c(0, 0)) + + theme_minimal() -> plot_cols diff --git a/25_07_phd_defense/assets/library.bib b/25_07_phd_defense/assets/library.bib index 98491d1..d8a31ee 100644 --- a/25_07_phd_defense/assets/library.bib +++ b/25_07_phd_defense/assets/library.bib @@ -14,6 +14,16 @@ booktitle = {Oxford Research Encyclopedia of Economics and Finance}, year = {2019} } +@article{marcjasz2022distributional, + title = {Distributional neural networks for electricity price forecasting}, + author = {Marcjasz, Grzegorz and Narajewski, Micha{\l} and Weron, Rafa{\l} and Ziel, Florian}, + journal = {Energy Economics}, + volume = {125}, + pages = {106843}, + year = {2023}, + doi = {10.1016/j.eneco.2023.106843}, + publisher = {Elsevier} +} @article{atiya2020does, title = {Why does forecast combination work so well?}, author = {Atiya, Amir F}, diff --git a/25_07_phd_defense/index.qmd b/25_07_phd_defense/index.qmd index 315860c..1aa6ad1 100644 --- a/25_07_phd_defense/index.qmd +++ b/25_07_phd_defense/index.qmd @@ -1428,10 +1428,12 @@ weights %>% ::: {.column width="48%"} Potential Downsides: + - Pointwise optimization can induce quantile crossing - Can be solved by sorting the predictions Upsides: + - Pointwise learning outperforms the Naive solution significantly - Online learning is much faster than batch methods - Smoothing further improves the predictive performance @@ -1464,7 +1466,7 @@ The [`r fontawesome::fa("github")` profoc](https://profoc.berrisch.biz/) R Packa :::: -:::: {.notes} + + +# Multivariate Probabilistic CRPS Learning with an Application to Day-Ahead Electricity Prices + +--- + +## Outline + +```{r, include=FALSE} +col_lightgray <- "#e7e7e7" +col_blue <- "#000088" +col_smooth_expost <- "#a7008b" +col_smooth <- "#187a00" +col_pointwise <- "#008790" +col_constant <- "#dd9002" +col_optimum <- "#666666" +col_green <- "#61B94C" +col_orange <- "#ffa600" +col_yellow <- "#FCE135" +``` + +
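<!-- Editorial aside: the chunk below is a hidden sanity check documenting how the
`cols` lookup built in assets/01_common.R is indexed; it is an illustration and
is not rendered on the slide. -->

```{r, include=FALSE}
# `cols` from assets/01_common.R is a 10 x 19 character matrix of hex codes;
# rows index the material-design shade (1 = lightest, 10 = darkest),
# columns the palette name.
stopifnot(is.matrix(cols), nrow(cols) == 10)
cols[5, "amber"] # e.g., the mid-shade amber used in the plots below
```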
+ +**Multivariate CRPS Learning** + +- Introduction +- Smoothing procedures +- Application to multivariate electricity price forecasts + +**The `profoc` R package** + +- Package overview +- Implementation details +- Illustrative examples + +## The Framework of Prediction under Expert Advice + +### The sequential framework + +:::: {.columns} + +::: {.column width="48%"} + +Each day, $t = 1, 2, ... T$ + +- The **forecaster** receives predictions $\widehat{X}_{t,k}$ from $K$ **experts** +- The **forecaster** assings weights $w_{t,k}$ to each **expert** +- The **forecaster** calculates her prediction: + +$$\widetilde{X}_{t}=\sum_{k=1}^K w_{t,k}\widehat{X}_{t,k}$$ + +- The realization for $t$ is observed + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +- The experts can be institutions, persons, or models +- The forecasts can be point-forecasts (i.e., mean or median) or full predictive distributions +- We do not need any assumptions concerning the underlying data +- `r Citet(my_bib, "cesa2006prediction")` + +::: + +:::: + +## The Regret + +Weights are updated sequentially according to the past performance of the $K$ experts. + +`r fontawesome::fa("arrow-right", fill ="#000000")` A loss function $\ell$ is needed (to compute the **cumulative regret** $R_{t,k}$) + +\begin{equation} + R_{t,k} = \widetilde{L}_{t} - \widehat{L}_{t,k} = \sum_{i = 1}^t \ell(\widetilde{X}_{i},Y_i) - \ell(\widehat{X}_{i,k},Y_i) + \label{eq_regret} +\end{equation} + +The cumulative regret: +- Indicates the predictive accuracy of expert $k$ until time $t$. +- Measures how much the forecaster *regrets* not having followed the expert's advice + +Popular loss functions for point forecasting `r Citet(my_bib, "gneiting2011making")`: + +:::: {.columns} + +::: {.column width="48%"} + +- $\ell_2$-loss $\ell_2(x, y) = | x -y|^2$ + - optimal for mean prediction + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +- $\ell_1$-loss $\ell_1(x, y) = | x -y|$ + - optimal for median predictions + +::: + +:::: + +--- + +:::: {.columns} + +::: {.column width="48%"} + +### Probabilistic Setting + +An appropriate loss: + +\begin{align*} + \text{CRPS}(F, y) & = \int_{\mathbb{R}} {(F(x) - \mathbb{1}\{ x > y \})}^2 dx + \label{eq_crps} +\end{align*} + +It's strictly proper `r Citet(my_bib, "gneiting2007strictly")`. + +Using the CRPS, we can calculate time-adaptive weights $w_{t,k}$. However, what if the experts' performance varies in parts of the distribution? + +`r fontawesome::fa("lightbulb", fill = col_yellow)` Utilize this relation: + +\begin{align*} + \text{CRPS}(F, y) = 2 \int_0^{1} \text{QL}_p(F^{-1}(p), y) \, d p. + \label{eq_crps_qs} +\end{align*} + +... to combine quantiles of the probabilistic forecasts individually using the quantile-loss QL. + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +### Optimal Convergence + +
+ +`r fontawesome::fa("exclamation", fill = col_orange)` exp-concavity of the loss is required for *selection* and *convex aggregation* properties + +`r fontawesome::fa("exclamation", fill = col_orange)` QL is convex, but not exp-concave + +`r fontawesome::fa("arrow-right", fill ="#000000")` The Bernstein Online Aggregation (BOA) lets us weaken the exp-concavity condition. + +Convergence rates of BOA are: + +`r fontawesome::fa("arrow-right", fill ="#000000")` Almost optimal w.r.t *selection* `r Citet(my_bib, "gaillard2018efficient")`. + +`r fontawesome::fa("arrow-right", fill ="#000000")` Almost optimal w.r.t *convex aggregation* `r Citet(my_bib, "wintenberger2017optimal")`. + +::: + +:::: + +## Multivariate CRPS Learning + + +:::: {.columns} + +::: {.column width="48%"} + +Additionally, we extend the **B-Smooth** and **P-Smooth** procedures to the multivariate setting: + +- Basis matrices for reducing +- - the probabilistic dimension from $P$ to $\widetilde P$ +- - the multivariate dimension from $D$ to $\widetilde D$ + + +- Hat matrices +- - penalized smoothing across P and D dimensions + +We utilize the mean Pinball Score over the entire space for hyperparameter optimization (e.g, $\lambda$) + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +*Basis Smoothing* + +Represent weights as linear combinations of bounded basis functions: + +\begin{equation} + \underbrace{\boldsymbol w_{t,k}}_{D \text{ x } P} = \sum_{j=1}^{\widetilde D} \sum_{l=1}^{\widetilde P} \beta_{t,j,l,k} \varphi^{\text{mv}}_{j} \varphi^{\text{pr}}_{l} = \underbrace{\boldsymbol \varphi^{\text{mv}}}_{D\text{ x }\widetilde D} \boldsymbol \beta_{t,k} \underbrace{{\boldsymbol\varphi^{\text{pr}}}'}_{\widetilde P \text{ x }P} \nonumber +\end{equation} + +A popular choice: B-Splines + +$\boldsymbol \beta_{t,k}$ is calculated using a reduced regret matrix: + +$\underbrace{\boldsymbol r_{t,k}}_{\widetilde P \times \widetilde D} = \boldsymbol \varphi^{\text{pr}} \underbrace{\left({\boldsymbol{QL}}_{\mathcal{P}}^{\nabla}(\widetilde{\boldsymbol X}_{t},Y_t)- {\boldsymbol{QL}}_{\mathcal{P}}^{\nabla}(\widehat{\boldsymbol X}_{t},Y_t)\right)}_{\text{PxD}}\boldsymbol \varphi^{\text{mv}}$ + +If $\widetilde P = P$ it holds that $\boldsymbol \varphi^{pr} = \boldsymbol{I}$ (pointwise) + +For $\widetilde P = 1$ we receive constant weights + +::: + +:::: + +## Multivariate CRPS Learning + +:::: {.columns} + +::: {.column width="48%"} + +**Penalized smoothing:** + +Let $\boldsymbol{\psi}^{\text{mv}}=(\psi_1,\ldots, \psi_{D})$ and $\boldsymbol{\psi}^{\text{pr}}=(\psi_1,\ldots, \psi_{P})$ be two sets of bounded basis functions on $(0,1)$: + +\begin{equation} + \boldsymbol w_{t,k} = \boldsymbol{\psi}^{\text{mv}} \boldsymbol{b}_{t,k} {\boldsymbol{\psi}^{pr}}' +\end{equation} + +with parameter matix $\boldsymbol b_{t,k}$. The latter is estimated to penalize $L_2$-smoothing which minimizes + +\begin{align} + & \| \boldsymbol{\beta}_{t,d, k}' \boldsymbol{\varphi}^{\text{pr}} - \boldsymbol b_{t, d, k}' \boldsymbol{\psi}^{\text{pr}} \|^2_2 + \lambda^{\text{pr}} \| \mathcal{D}_{q} (\boldsymbol b_{t, d, k}' \boldsymbol{\psi}^{\text{pr}}) \|^2_2 + \nonumber \\ + & \| \boldsymbol{\beta}_{t, p, k}' \boldsymbol{\varphi}^{\text{mv}} - \boldsymbol b_{t, p, k}' \boldsymbol{\psi}^{\text{mv}} \|^2_2 + \lambda^{\text{mv}} \| \mathcal{D}_{q} (\boldsymbol b_{t, p, k}' \boldsymbol{\psi}^{\text{mv}}) \|^2_2 \nonumber +\end{align} + +with differential operator $\mathcal{D}_q$ of order $q$ + +Computation is easy since we have an analytical solution. 
+ +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +```{r, fig.align="center", echo=FALSE, out.width = "1000px"} +knitr::include_graphics("assets/mcrps_learning/algorithm.svg") +``` + +::: + +:::: + +## Application + + +:::: {.columns} + +::: {.column width="48%"} + +#### Data + +- Day-Ahead electricity price forecasts from `r Citet(my_bib, "marcjasz2022distributional")` +- Produced using probabilistic neural networks +- 24-dimensional distributional forecasts +- Distribution assumptions: JSU and Normal +- 8 experts (4 JSU, 4 Normal) +- 27th Dec. 2018 to 31st Dec. 2020 (736 days) +- We extract 99 quantiles (percentiles) + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +#### Setup + +Evaluation: Exclude first 182 observations + +Extensions: Penalized smoothing | Forgetting + +Tuning strategies: + +- Bayesian Fix + - Sophisticated Baesian Search algorithm +- Online + - Dynamic based on past performance +- Bayesian Online + - First Bayesian Fix then Online + +Computation Time: ~30 Minutes + +::: + +:::: + +# Special Cases + + +:::: {.columns} + +::: {.column width="48%"} + +::: {.panel-tabset} + +## Constant + +```{r, fig.align="center", echo=FALSE, out.width = "400"} +knitr::include_graphics("assets/mcrps_learning/constant.svg") +``` + +## Constant PR + +```{r, fig.align="center", echo=FALSE, out.width = "400"} +knitr::include_graphics("assets/mcrps_learning/constant_pr.svg") +``` + +## Constant MV + +```{r, fig.align="center", echo=FALSE, out.width = "400"} +knitr::include_graphics("assets/mcrps_learning/constant_mv.svg") +``` + +:::: + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +::: {.panel-tabset} + +## Pointwise + +```{r, fig.align="center", echo=FALSE, out.width = "400"} +knitr::include_graphics("assets/mcrps_learning/pointwise.svg") +``` + +## Smooth + +```{r, fig.align="center", echo=FALSE, out.width = "400"} +knitr::include_graphics("assets/mcrps_learning/smooth_best.svg") +``` + +:::: + +::: + +:::: + +## Results + +```{r, fig.align="center", echo=FALSE, out.width = "400"} +knitr::include_graphics("assets/mcrps_learning/tab_performance_sa.svg") +``` + +## Results + +```{r, warning=FALSE, fig.align="center", echo=FALSE, fig.width=12, fig.height=6} +load("assets/mcrps_learning/pars_data.rds") +pars_data %>% + ggplot(aes(x = dates, y = value)) + + geom_rect(aes( + ymin = 0, + ymax = value * 1.2, + xmin = dates[1], + xmax = dates[182], + fill = "Burn-In" + )) + + geom_line(aes(color = name), linewidth = linesize, show.legend = FALSE) + + scale_colour_manual( + values = as.character(cols[5, c("pink", "amber", "green")]) + ) + + facet_grid(name ~ ., + scales = "free_y", + # switch = "both" + ) + + scale_y_continuous( + trans = "log2", + labels = scaleFUN + ) + + theme_minimal() + + theme( + # plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"), + text = element_text(size = text_size), + legend.key.width = unit(0.9, "inch"), + legend.position = "none" + ) + + ylab(NULL) + + xlab("date") + + scale_fill_manual(NULL, + values = as.character(cols[3, "grey"]) + ) +``` + +## Results: Hour 16:00-17:00 + +```{r, fig.align="center", echo=FALSE, fig.width=12, fig.height=6} +load("assets/mcrps_learning/weights_h.rds") +weights_h %>% + ggplot(aes(date, q, fill = weight)) + + geom_raster(interpolate = TRUE) + + facet_grid( + Expert ~ . 
    # , labeller = labeller(Mod = mod_labs)
  ) +
  theme_minimal() +
  theme(
    # plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"),
    text = element_text(size = text_size),
    legend.key.height = unit(0.9, "inch")
  ) +
  scale_x_date(expand = c(0, 0)) +
  scale_fill_gradientn(
    oob = scales::squish,
    limits = c(0, 1),
    values = c(seq(0, 0.4, length.out = 8), 0.65, 1),
    colours = c(
      cols[8, "red"],
      cols[5, "deep-orange"],
      cols[5, "amber"],
      cols[5, "yellow"],
      cols[5, "lime"],
      cols[5, "light-green"],
      cols[5, "green"],
      cols[7, "green"],
      cols[9, "green"],
      cols[10, "green"]
    ),
    breaks = seq(0, 1, 0.1)
  ) +
  xlab("date") +
  ylab("probability") +
  scale_y_continuous(breaks = c(0.1, 0.5, 0.9))
```

## Results: Median

```{r, fig.align="center", echo=FALSE, fig.width=12, fig.height=6}
load("assets/mcrps_learning/weights_q.rds")
weights_q %>%
  mutate(hour = as.numeric(hour) - 1) %>%
  ggplot(aes(date, hour, fill = weight)) +
  geom_raster(interpolate = TRUE) +
  facet_grid(
    Expert ~ . # , labeller = labeller(Mod = mod_labs)
  ) +
  theme_minimal() +
  theme(
    # plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"),
    text = element_text(size = text_size),
    legend.key.height = unit(0.9, "inch")
  ) +
  scale_x_date(expand = c(0, 0)) +
  scale_fill_gradientn(
    oob = scales::squish,
    limits = c(0, 1),
    values = c(seq(0, 0.4, length.out = 8), 0.65, 1),
    colours = c(
      cols[8, "red"],
      cols[5, "deep-orange"],
      cols[5, "amber"],
      cols[5, "yellow"],
      cols[5, "lime"],
      cols[5, "light-green"],
      cols[5, "green"],
      cols[7, "green"],
      cols[9, "green"],
      cols[10, "green"]
    ),
    breaks = seq(0, 1, 0.1)
  ) +
  xlab("date") +
  ylab("hour") +
  scale_y_continuous(breaks = c(0, 8, 16, 24))
```

## Profoc R Package

:::: {.columns}

::: {.column width="48%"}

### Probabilistic Forecast Combination - profoc

Available on [GitHub](https://github.com/BerriJ/profoc) and [CRAN](https://CRAN.R-project.org/package=profoc)

Main function: `online()` for online learning.

- Works with multivariate and/or probabilistic data
- Implements BOA, ML-POLY, EWA (and the gradient versions)
- Implements many extensions like smoothing, forgetting, thresholding, etc.
- Various loss functions are available
- Various methods (`predict`, `update`, `plot`, etc.)

:::

::: {.column width="2%"}

:::

::: {.column width="48%"}

### Speed

Large parts of profoc are implemented in C++.
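
As a rough sense of the workload, a hypothetical timing sketch with the dimensions of our application (736 days, 24 hours, 99 quantiles, 8 experts); the T x D x P x K expert-array layout and the simulated data are illustrative assumptions, not a benchmark from the paper:

```{r, echo=TRUE, eval=FALSE}
# Hypothetical timing sketch; simulated data with the application's dimensions.
n_t <- 736; n_d <- 24; n_p <- 99; n_k <- 8
y <- matrix(rnorm(n_t * n_d), nrow = n_t, ncol = n_d)
experts <- array(rnorm(n_t * n_d * n_p * n_k), dim = c(n_t, n_d, n_p, n_k))
system.time(
  mod <- profoc::online(y = y, experts = experts, tau = 1:n_p / 100)
)
```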
+ +
+ +We use `Rcpp`, `RcppArmadillo`, and OpenMP. + +We use `Rcpp` modules to expose a class to R +- Offers great flexibility for the end-user +- Requires very little knowledge of C++ code +- High-Level interface is easy to use + +::: + +:::: + +## Profoc - B-Spline Basis + +:::: {.columns} + +::: {.column width="48%"} + +Basis specification `b_smooth_pr` is internally passed to `make_basis_mats()`: + +```{r, echo = TRUE, eval = FALSE, cache = FALSE} +mod <- online( + y = Y, + experts = experts, + tau = 1:99 / 100, + b_smooth_pr = list( + knots = 9, + mu = 0.3, # NEW + sigma = 1, + nonc = 0, + tailweight = 1, + deg = 3 + ) +) +``` + +Knots are distributed using the generalized beta distribution. + +TODO: Add actual algorithm to backup slides + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +TODO + +::: + +:::: + +## Wrap-Up + +:::: {.columns} + +::: {.column width="48%"} + + The [`r fontawesome::fa("github")` profoc](https://profoc.berrisch.biz/) R Package: + +Profoc is a flexible framework for online learning. + +- It implements several algorithms +- It implements several loss functions +- It implements several extensions +- Its high- and low-level interfaces offer great flexibility + +Profoc is fast. + +- The core components are written in C++ +- The core components utilize OpenMP for parallelization + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="48%"} + +Multivariate Extension: + +- Code is available now +- [Pre-Print](https://arxiv.org/abs/2303.10019) is available now + +Get these slides: + +
+ +
+[https://berrisch.biz/slides/23_06_ecmi/](https://berrisch.biz/slides/23_06_ecmi/) + +::: + +:::: ## Columns Template diff --git a/flake.nix b/flake.nix index 25d1e8d..f6c6b45 100644 --- a/flake.nix +++ b/flake.nix @@ -79,6 +79,7 @@ rPackages.gganimate rPackages.cowplot rPackages.xtable + rPackages.ggsci rPackages.profoc ];