From b2e0a03df8813afc0c772ea8e7729d8de5c4734b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Mon, 18 Nov 2024 11:11:00 +0100
Subject: [PATCH] Update notebooks

---
 notebooks/01_tabular_data_exploration.ipynb   |  19 ++
 .../02_numerical_pipeline_hands_on.ipynb      |   8 +-
 .../02_numerical_pipeline_introduction.ipynb  |   6 +-
 notebooks/03_categorical_pipeline.ipynb       |  13 +-
 ...egorical_pipeline_column_transformer.ipynb |   2 +-
 notebooks/cross_validation_ex_01.ipynb        |   4 +-
 notebooks/cross_validation_grouping.ipynb     |  77 +++---
 .../cross_validation_learning_curve.ipynb     |   2 +-
 notebooks/cross_validation_sol_01.ipynb       |   2 +-
 .../cross_validation_stratification.ipynb     |  71 +++---
 notebooks/cross_validation_train_test.ipynb   |  10 +-
 notebooks/datasets_blood_transfusion.ipynb    |   2 +-
 notebooks/ensemble_bagging.ipynb              |  72 +++---
 notebooks/ensemble_gradient_boosting.ipynb    | 235 +++++++++---------
 notebooks/ensemble_sol_02.ipynb               |  15 ++
 notebooks/linear_models_ex_02.ipynb           |   2 +-
 notebooks/linear_models_ex_04.ipynb           |   2 +-
 ...s_feature_engineering_classification.ipynb |   4 +-
 notebooks/linear_models_sol_02.ipynb          |   2 +-
 notebooks/linear_models_sol_04.ipynb          |   2 +-
 .../linear_regression_non_linear_link.ipynb   |  13 +-
 .../linear_regression_without_sklearn.ipynb   |   2 +-
 notebooks/metrics_classification.ipynb        |   4 +-
 notebooks/metrics_regression.ipynb            |   5 +-
 notebooks/parameter_tuning_grid_search.ipynb  |   4 +-
 notebooks/parameter_tuning_manual.ipynb       |   2 +-
 notebooks/parameter_tuning_nested.ipynb       |   2 +-
 .../parameter_tuning_randomized_search.ipynb  |   2 +-
 notebooks/trees_dataset.ipynb                 |   2 +-
 29 files changed, 328 insertions(+), 258 deletions(-)

diff --git a/notebooks/01_tabular_data_exploration.ipynb b/notebooks/01_tabular_data_exploration.ipynb
index 6e11251e6..d806e4901 100644
--- a/notebooks/01_tabular_data_exploration.ipynb
+++ b/notebooks/01_tabular_data_exploration.ipynb
@@ -98,6 +98,25 @@
     "adult_census.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "An alternative is to omit the `head` method. This would output the initial and\n",
+    "final rows and columns, but everything in between is not shown by default. It\n",
+    "also provides the dataframe's dimensions at the bottom in the format `n_rows`\n",
+    "x `n_columns`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adult_census"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/notebooks/02_numerical_pipeline_hands_on.ipynb b/notebooks/02_numerical_pipeline_hands_on.ipynb
index fff46e8cc..d53039ebe 100644
--- a/notebooks/02_numerical_pipeline_hands_on.ipynb
+++ b/notebooks/02_numerical_pipeline_hands_on.ipynb
@@ -38,7 +38,7 @@
     "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n",
     "# drop the duplicated column `\"education-num\"` as stated in the first notebook\n",
     "adult_census = adult_census.drop(columns=\"education-num\")\n",
-    "adult_census.head()"
+    "adult_census"
    ]
   },
   {
@@ -64,7 +64,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data.head()"
+    "data"
    ]
   },
   {
@@ -157,7 +157,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data.head()"
+    "data"
    ]
   },
   {
@@ -177,7 +177,7 @@
    "outputs": [],
    "source": [
     "numerical_columns = [\"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n",
-    "data[numerical_columns].head()"
+    "data[numerical_columns]"
    ]
   },
   {
diff --git a/notebooks/02_numerical_pipeline_introduction.ipynb b/notebooks/02_numerical_pipeline_introduction.ipynb
index 102280de9..a7bbcbd29 100644
--- a/notebooks/02_numerical_pipeline_introduction.ipynb
+++ b/notebooks/02_numerical_pipeline_introduction.ipynb
@@ -53,7 +53,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "adult_census.head()"
+    "adult_census"
    ]
   },
   {
@@ -86,14 +86,14 @@
    "outputs": [],
    "source": [
     "data = adult_census.drop(columns=[target_name])\n",
-    "data.head()"
+    "data"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can now linger on the variables, also denominated features, that we later\n",
+    "We can now focus on the variables, also denominated features, that we later\n",
     "use to build our predictive model. In addition, we can also check how many\n",
     "samples are available in our dataset."
    ]
diff --git a/notebooks/03_categorical_pipeline.ipynb b/notebooks/03_categorical_pipeline.ipynb
index 575268c9f..303ec3411 100644
--- a/notebooks/03_categorical_pipeline.ipynb
+++ b/notebooks/03_categorical_pipeline.ipynb
@@ -129,7 +129,7 @@
    "outputs": [],
    "source": [
     "data_categorical = data[categorical_columns]\n",
-    "data_categorical.head()"
+    "data_categorical"
    ]
   },
   {
@@ -312,7 +312,7 @@
    "outputs": [],
    "source": [
     "print(f\"The dataset is composed of {data_categorical.shape[1]} features\")\n",
-    "data_categorical.head()"
+    "data_categorical"
    ]
   },
   {
@@ -404,7 +404,7 @@
     "and check the generalization performance of this machine learning pipeline using\n",
     "cross-validation.\n",
     "\n",
-    "Before we create the pipeline, we have to linger on the `native-country`.\n",
+    "Before we create the pipeline, we have to focus on the `native-country`.\n",
     "Let's recall some statistics regarding this column."
    ]
   },
@@ -529,9 +529,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As you can see, this representation of the categorical variables is\n",
-    "slightly more predictive of the revenue than the numerical variables\n",
-    "that we used previously."
+    "As you can see, this representation of the categorical variables is slightly\n",
+    "more predictive of the revenue than the numerical variables that we used\n",
+    "previously. The reason is that we have more (predictive) categorical\n",
+    "features than numerical ones."
    ]
   },
   {
diff --git a/notebooks/03_categorical_pipeline_column_transformer.ipynb b/notebooks/03_categorical_pipeline_column_transformer.ipynb
index f9f3d5293..14f0b77c4 100644
--- a/notebooks/03_categorical_pipeline_column_transformer.ipynb
+++ b/notebooks/03_categorical_pipeline_column_transformer.ipynb
@@ -244,7 +244,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data_test.head()"
+    "data_test"
    ]
   },
   {
diff --git a/notebooks/cross_validation_ex_01.ipynb b/notebooks/cross_validation_ex_01.ipynb
index b55e6efff..2e7ac22b6 100644
--- a/notebooks/cross_validation_ex_01.ipynb
+++ b/notebooks/cross_validation_ex_01.ipynb
@@ -52,7 +52,7 @@
     "exercise.\n",
     "\n",
     "Also, this classifier can become more flexible/expressive by using a so-called\n",
-    "kernel that makes the model become non-linear. Again, no understanding regarding\n",
+    "kernel that makes the model become non-linear. Again, no understanding regarding\n",
     "the mathematics is required to accomplish this exercise.\n",
     "\n",
     "We will use an RBF kernel where a parameter `gamma` allows to tune the\n",
@@ -160,4 +160,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/notebooks/cross_validation_grouping.ipynb b/notebooks/cross_validation_grouping.ipynb
index b7aa2032e..705db1ca9 100644
--- a/notebooks/cross_validation_grouping.ipynb
+++ b/notebooks/cross_validation_grouping.ipynb
@@ -5,9 +5,8 @@
    "metadata": {},
    "source": [
     "# Sample grouping\n",
-    "We are going to linger into the concept of sample groups. As in the previous\n",
-    "section, we will give an example to highlight some surprising results. This\n",
-    "time, we will use the handwritten digits dataset."
+    "In this notebook we present the concept of **sample groups**. We use the\n",
+    "handwritten digits dataset to highlight some surprising results."
    ]
   },
   {
@@ -26,8 +25,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We will recreate the same model used in the previous notebook: a logistic\n",
-    "regression classifier with a preprocessor to scale the data."
+    "We create a model consisting of a logistic regression classifier with a\n",
+    "preprocessor to scale the data.\n",
+    "\n",
+    "<div class=\"admonition note alert alert-info\">\n",
+    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
+    "<p class=\"last\">Here we use a <tt class=\"docutils literal\">MinMaxScaler</tt> as we know that each pixel's gray-scale is\n",
+    "strictly bounded between 0 (white) and 16 (black). This makes <tt class=\"docutils literal\">MinMaxScaler</tt>\n",
+    "more suited in this case than <tt class=\"docutils literal\">StandardScaler</tt>, as some pixels consistently\n",
+    "have low variance (pixels at the borders might almost always be zero if most\n",
+    "digits are centered in the image). Then, using <tt class=\"docutils literal\">StandardScaler</tt> can result in\n",
+    "a very high scaled value due to division by a small number.</p>\n",
+    "</div>"
    ]
   },
   {
@@ -47,8 +56,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We will use the same baseline model. We will use a `KFold` cross-validation\n",
-    "without shuffling the data at first."
+    "The idea is to compare the estimated generalization performance using\n",
+    "different cross-validation techniques and see how such estimations are\n",
+    "impacted by underlying data structures. We first use a `KFold`\n",
+    "cross-validation without shuffling the data."
    ]
   },
   {
@@ -97,9 +108,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We observe that shuffling the data improves the mean accuracy. We could go a\n",
-    "little further and plot the distribution of the testing score. We can first\n",
-    "concatenate the test scores."
+    "We observe that shuffling the data improves the mean accuracy. We can go a\n",
+    "little further and plot the distribution of the testing score. For this\n",
+    "purpose, we concatenate the test scores."
    ]
   },
   {
@@ -120,7 +131,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's plot the distribution now."
+    "Let's now plot the score distributions."
    ]
   },
   {
@@ -131,7 +142,7 @@
    "source": [
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "all_scores.plot.hist(bins=10, edgecolor=\"black\", alpha=0.7)\n",
+    "all_scores.plot.hist(bins=16, edgecolor=\"black\", alpha=0.7)\n",
     "plt.xlim([0.8, 1.0])\n",
     "plt.xlabel(\"Accuracy score\")\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
@@ -142,9 +153,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The cross-validation testing error that uses the shuffling has less variance\n",
-    "than the one that does not impose any shuffling. It means that some specific\n",
-    "fold leads to a low score in this case."
+    "Shuffling the data results in a higher cross-validated test accuracy with less\n",
+    "variance compared to when the data is not shuffled. It means that some\n",
+    "specific fold leads to a low score in this case."
    ]
   },
   {
@@ -160,9 +171,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Thus, there is an underlying structure in the data that shuffling will break\n",
-    "and get better results. To get a better understanding, we should read the\n",
-    "documentation shipped with the dataset."
+    "Thus, shuffling the data breaks the underlying structure and makes the\n",
+    "classification task easier for our model. To get a better understanding, we can\n",
+    "read the dataset description in more detail:"
    ]
   },
   {
@@ -263,7 +274,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can check the grouping by plotting the indices linked to writer ids."
+    "We can check the grouping by plotting the indices linked to writers' ids."
    ]
   },
   {
@@ -284,8 +295,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once we group the digits by writer, we can use cross-validation to take this\n",
-    "information into account: the class containing `Group` should be used."
+    "Once we group the digits by writer, we can incorporate this information into\n",
+    "the cross-validation process by using group-aware variations of the strategies\n",
+    "we have explored in this course, for example, the `GroupKFold` strategy."
    ]
   },
   {
@@ -309,10 +321,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We see that this strategy is less optimistic regarding the model\n",
-    "generalization performance. However, this is the most reliable if our goal is\n",
-    "to make handwritten digits recognition writers independent. Besides, we can as\n",
-    "well see that the standard deviation was reduced."
+    "We see that this strategy leads to a lower generalization performance than the\n",
+    "other two techniques. However, this is the most reliable estimate if our goal\n",
+    "is to evaluate the capabilities of the model to generalize to new unseen\n",
+    "writers. In this sense, shuffling the dataset (or alternatively using the\n",
+    "writers' ids as a new feature) would lead the model to memorize the different\n",
+    "writers' particular handwriting."
    ]
   },
   {
@@ -337,7 +351,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_scores.plot.hist(bins=10, edgecolor=\"black\", alpha=0.7)\n",
+    "all_scores.plot.hist(bins=16, edgecolor=\"black\", alpha=0.7)\n",
     "plt.xlim([0.8, 1.0])\n",
     "plt.xlabel(\"Accuracy score\")\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
@@ -348,9 +362,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As a conclusion, it is really important to take any sample grouping pattern\n",
-    "into account when evaluating a model. Otherwise, the results obtained will be\n",
-    "over-optimistic in regards with reality."
+    "In conclusion, accounting for any sample grouping patterns is crucial when\n",
+    "assessing a model\u2019s ability to generalize to new groups. Without this\n",
+    "consideration, the results may appear overly optimistic compared to the actual\n",
+    "performance.\n",
+    "\n",
+    "The interested reader can learn about other group-aware cross-validation\n",
+    "techniques in the [scikit-learn user\n",
+    "guide](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data)."
    ]
   }
  ],
diff --git a/notebooks/cross_validation_learning_curve.ipynb b/notebooks/cross_validation_learning_curve.ipynb
index aaac1c12d..7c3eccc76 100644
--- a/notebooks/cross_validation_learning_curve.ipynb
+++ b/notebooks/cross_validation_learning_curve.ipynb
@@ -11,7 +11,7 @@
     "generalizing. Besides these aspects, it is also important to understand how\n",
     "the different errors are influenced by the number of samples available.\n",
     "\n",
-    "In this notebook, we will show this aspect by looking a the variability of\n",
+    "In this notebook, we will show this aspect by looking at the variability of\n",
     "the different errors.\n",
     "\n",
     "Let's first load the data and create the same model as in the previous\n",
diff --git a/notebooks/cross_validation_sol_01.ipynb b/notebooks/cross_validation_sol_01.ipynb
index 925c7e379..0d46c489f 100644
--- a/notebooks/cross_validation_sol_01.ipynb
+++ b/notebooks/cross_validation_sol_01.ipynb
@@ -52,7 +52,7 @@
     "exercise.\n",
     "\n",
     "Also, this classifier can become more flexible/expressive by using a so-called\n",
-    "kernel that makes the model become non-linear. Again, no requirement regarding\n",
+    "kernel that makes the model become non-linear. Again, no understanding regarding\n",
     "the mathematics is required to accomplish this exercise.\n",
     "\n",
     "We will use an RBF kernel where a parameter `gamma` allows to tune the\n",
diff --git a/notebooks/cross_validation_stratification.ipynb b/notebooks/cross_validation_stratification.ipynb
index 7e330c3d2..0bb42d616 100644
--- a/notebooks/cross_validation_stratification.ipynb
+++ b/notebooks/cross_validation_stratification.ipynb
@@ -54,10 +54,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once we created our model, we will use the cross-validation framework to\n",
-    "evaluate it. We will use the `KFold` cross-validation strategy. We will define\n",
-    "a dataset with nine samples and repeat the cross-validation three times (i.e.\n",
-    "`n_splits`)."
+    "Once the model is created, we can evaluate it using cross-validation. We start\n",
+    "by using the `KFold` strategy.\n",
+    "\n",
+    "Let's review how this strategy works. For this purpose, we define a dataset\n",
+    "with nine samples and split the dataset into three folds (i.e. `n_splits=3`)."
    ]
   },
   {
@@ -79,12 +80,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "By defining three splits, we will use three samples for testing and six for\n",
-    "training each time. `KFold` does not shuffle by default. It means that it will\n",
-    "select the three first samples for the testing set at the first split, then\n",
-    "the three next three samples for the second split, and the three next for the\n",
-    "last split. In the end, all samples have been used in testing at least once\n",
-    "among the different splits.\n",
+    "By defining three splits, we use three samples (1-fold) for testing and six\n",
+    "(2-folds) for training each time. `KFold` does not shuffle by default. It\n",
+    "means that the three first samples are selected for the testing set at the\n",
+    "first split, then the next three samples for the second split, and the\n",
+    "next three for the last split. In the end, all samples have been used in\n",
+    "testing at least once among the different splits.\n",
     "\n",
     "Now, let's apply this strategy to check the generalization performance of our\n",
     "model."
@@ -111,8 +112,8 @@
    "metadata": {},
    "source": [
     "It is a real surprise that our model cannot correctly classify any sample in\n",
-    "any cross-validation split. We will now check our target's value to understand\n",
-    "the issue."
+    "any cross-validation split. We now check our target's value to understand the\n",
+    "issue."
    ]
   },
   {
@@ -134,18 +135,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We see that the target vector `target` is ordered. It will have some\n",
-    "unexpected consequences when using the `KFold` cross-validation. To illustrate\n",
-    "the consequences, we will show the class count in each fold of the\n",
-    "cross-validation in the train and test set.\n",
+    "We see that the target vector `target` is ordered. This has some unexpected\n",
+    "consequences when using the `KFold` cross-validation. To illustrate the\n",
+    "consequences, we show the class count in each fold of the cross-validation in\n",
+    "the train and test set.\n",
     "\n",
     "Let's compute the class counts for both the training and testing sets using\n",
     "the `KFold` cross-validation, and plot these information in a bar plot.\n",
     "\n",
-    "We will iterate given the number of split and check how many samples of each\n",
-    "are present in the training and testing set. We will store the information\n",
-    "into two distincts lists; one for the training set and one for the testing\n",
-    "set."
+    "We iterate given the number of splits and check how many samples of each class are\n",
+    "present in the training and testing set. We then store the information into\n",
+    "two distinct lists; one for the training set and one for the testing set."
    ]
   },
   {
@@ -172,8 +172,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To plot the information on a single figure, we will concatenate the\n",
-    "information regarding the fold within the same dataset."
+    "To plot the information on a single figure, we concatenate the information\n",
+    "regarding the fold within the same dataset."
    ]
   },
   {
@@ -218,7 +218,7 @@
     "train_cv_counts.plot.bar()\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
     "plt.ylabel(\"Count\")\n",
-    "_ = plt.title(\"Training set\")"
+    "_ = plt.title(\"Training set class counts\")"
    ]
   },
   {
@@ -230,7 +230,7 @@
     "test_cv_counts.plot.bar()\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
     "plt.ylabel(\"Count\")\n",
-    "_ = plt.title(\"Test set\")"
+    "_ = plt.title(\"Test set class counts\")"
    ]
   },
   {
@@ -268,7 +268,7 @@
     "90%. Now that we solved our first issue, it would be interesting to check if\n",
     "the class frequency in the training and testing set is equal to our original\n",
     "set's class frequency. It would ensure that we are training and testing our\n",
-    "model with a class distribution that we will encounter in production."
+    "model with a class distribution that we would encounter in production."
    ]
   },
   {
@@ -303,7 +303,7 @@
     "train_cv_counts.plot.bar()\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
     "plt.ylabel(\"Count\")\n",
-    "_ = plt.title(\"Training set\")"
+    "_ = plt.title(\"Training set class counts\\n(with shuffling)\")"
    ]
   },
   {
@@ -315,7 +315,7 @@
     "test_cv_counts.plot.bar()\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
     "plt.ylabel(\"Count\")\n",
-    "_ = plt.title(\"Test set\")"
+    "_ = plt.title(\"Test set class counts\\n(with shuffling)\")"
    ]
   },
   {
@@ -388,7 +388,7 @@
     "train_cv_counts.plot.bar()\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
     "plt.ylabel(\"Count\")\n",
-    "_ = plt.title(\"Training set\")"
+    "_ = plt.title(\"Training set class counts\\n(with stratifying)\")"
    ]
   },
   {
@@ -400,7 +400,7 @@
     "test_cv_counts.plot.bar()\n",
     "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
     "plt.ylabel(\"Count\")\n",
-    "_ = plt.title(\"Test set\")"
+    "_ = plt.title(\"Test set class counts\\n(with stratifying)\")"
    ]
   },
   {
@@ -411,8 +411,17 @@
     "train set and the test set. The difference is due to the small number of\n",
     "samples in the iris dataset.\n",
     "\n",
-    "In conclusion, this is a good practice to use stratification within the\n",
-    "cross-validation framework when dealing with a classification problem."
+    "In other words, stratifying is more effective than just shuffling when it\n",
+    "comes to making sure that the distributions of classes in all the folds are\n",
+    "representative of the entire dataset. As training and testing folds have\n",
+    "similar class distributions, stratifying leads to a more realistic measure of\n",
+    "the model\u2019s ability to generalize. This is especially important when the\n",
+    "performance metrics depend on the proportion of the positive class, as we will\n",
+    "see in a future notebook.\n",
+    "\n",
+    "The interested reader can learn about other stratified cross-validation\n",
+    "techniques in the [scikit-learn user\n",
+    "guide](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-with-stratification-based-on-class-labels)."
    ]
   }
  ],
diff --git a/notebooks/cross_validation_train_test.ipynb b/notebooks/cross_validation_train_test.ipynb
index 8bcba9aa9..77ccbfc89 100644
--- a/notebooks/cross_validation_train_test.ipynb
+++ b/notebooks/cross_validation_train_test.ipynb
@@ -10,7 +10,7 @@
     "of predictive models. While this section could be slightly redundant, we\n",
     "intend to go into details into the cross-validation framework.\n",
     "\n",
-    "Before we dive in, let's linger on the reasons for always having training and\n",
+    "Before we dive in, let's focus on the reasons for always having training and\n",
     "testing sets. Let's first look at the limitation of using a dataset without\n",
     "keeping any samples out.\n",
     "\n",
@@ -42,7 +42,7 @@
     "notebook. The target to be predicted is a continuous variable and not anymore\n",
     "discrete. This task is called regression.\n",
     "\n",
-    "This, we will use a predictive model specific to regression and not to\n",
+    "Thus, we will use a predictive model specific to regression and not to\n",
     "classification."
    ]
   },
@@ -61,7 +61,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data.head()"
+    "data"
    ]
   },
   {
@@ -79,7 +79,7 @@
    "outputs": [],
    "source": [
     "target *= 100\n",
-    "target.head()"
+    "target"
    ]
   },
   {
@@ -343,7 +343,7 @@
     "import pandas as pd\n",
     "\n",
     "cv_results = pd.DataFrame(cv_results)\n",
-    "cv_results.head()"
+    "cv_results"
    ]
   },
   {
diff --git a/notebooks/datasets_blood_transfusion.ipynb b/notebooks/datasets_blood_transfusion.ipynb
index 7d6d89006..dabde5ec0 100644
--- a/notebooks/datasets_blood_transfusion.ipynb
+++ b/notebooks/datasets_blood_transfusion.ipynb
@@ -84,7 +84,7 @@
     "* `Recency`: the time in months since the last time a person intended to give\n",
     "  blood;\n",
     "* `Frequency`: the number of time a person intended to give blood in the past;\n",
-    "* `Monetary`: the amount of blood given in the past (in c.c.);\n",
+    "* `Monetary`: the amount of blood given in the past (in cm\u00b3);\n",
     "* `Time`: the time in months since the first time a person intended to give\n",
     "  blood.\n",
     "\n",
diff --git a/notebooks/ensemble_bagging.ipynb b/notebooks/ensemble_bagging.ipynb
index 5fc5a0328..27c29a453 100644
--- a/notebooks/ensemble_bagging.ipynb
+++ b/notebooks/ensemble_bagging.ipynb
@@ -6,16 +6,15 @@
    "source": [
     "# Bagging\n",
     "\n",
-    "This notebook introduces a very natural strategy to build ensembles of machine\n",
-    "learning models named \"bagging\".\n",
+    "In this notebook we introduce a very natural strategy to build ensembles of\n",
+    "machine learning models, named \"bagging\".\n",
     "\n",
     "\"Bagging\" stands for Bootstrap AGGregatING. It uses bootstrap resampling\n",
     "(random sampling with replacement) to learn several models on random\n",
     "variations of the training set. At predict time, the predictions of each\n",
     "learner are aggregated to give the final predictions.\n",
     "\n",
-    "First, we will generate a simple synthetic dataset to get insights regarding\n",
-    "bootstraping."
+    "We first create a simple synthetic dataset to better understand bootstrapping."
    ]
   },
   {
@@ -27,16 +26,14 @@
     "import pandas as pd\n",
     "import numpy as np\n",
     "\n",
-    "# create a random number generator that will be used to set the randomness\n",
-    "rng = np.random.RandomState(1)\n",
-    "\n",
     "\n",
     "def generate_data(n_samples=30):\n",
     "    \"\"\"Generate synthetic dataset. Returns `data_train`, `data_test`,\n",
     "    `target_train`.\"\"\"\n",
     "    x_min, x_max = -3, 3\n",
+    "    rng = np.random.default_rng(1)  # Create a random number generator\n",
     "    x = rng.uniform(x_min, x_max, size=n_samples)\n",
-    "    noise = 4.0 * rng.randn(n_samples)\n",
+    "    noise = 4.0 * rng.normal(size=(n_samples,))\n",
     "    y = x**3 - 0.5 * (x + 1) ** 2 + noise\n",
     "    y /= y.std()\n",
     "\n",
@@ -70,9 +67,8 @@
    "metadata": {},
    "source": [
     "\n",
-    "The relationship between our feature and the target to predict is non-linear.\n",
-    "However, a decision tree is capable of approximating such a non-linear\n",
-    "dependency:"
+    "The target to predict is a non-linear function of the only feature. However, a\n",
+    "decision tree is capable of approximating such a non-linear dependency:"
    ]
   },
   {
@@ -121,16 +117,16 @@
     "\n",
     "## Bootstrap resampling\n",
     "\n",
-    "Given a dataset with `n` data points, bootstrapping corresponds to resampling\n",
-    "with replacement  `n` out of such `n` data points uniformly at random.\n",
+    "Bootstrapping involves uniformly resampling `n` data points from a dataset of\n",
+    "`n` points, with replacement, ensuring each sample has an equal chance of\n",
+    "selection.\n",
     "\n",
     "As a result, the output of the bootstrap sampling procedure is another dataset\n",
-    "with also n data points, but likely with duplicates. As a consequence, there\n",
-    "are also data points from the original dataset that are never selected to\n",
-    "appear in a bootstrap sample (by chance). Those data points that are left away\n",
-    "are often referred to as the out-of-bag sample.\n",
+    "with `n` data points, likely containing duplicates. Consequently, some data\n",
+    "points from the original dataset may not be selected for a bootstrap sample.\n",
+    "These unselected data points are often referred to as the out-of-bag sample.\n",
     "\n",
-    "We will create a function that given `data` and `target` will return a\n",
+    "We now create a function that, given `data` and `target`, returns a\n",
     "resampled variation `data_bootstrap` and `target_bootstrap`."
    ]
   },
@@ -140,9 +136,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def bootstrap_sample(data, target):\n",
+    "def bootstrap_sample(data, target, seed=0):\n",
     "    # Indices corresponding to a sampling with replacement of the same sample\n",
     "    # size than the original data\n",
+    "    rng = np.random.default_rng(seed)\n",
     "    bootstrap_indices = rng.choice(\n",
     "        np.arange(target.shape[0]),\n",
     "        size=target.shape[0],\n",
@@ -160,7 +157,7 @@
    "metadata": {},
    "source": [
     "\n",
-    "We will generate 3 bootstrap samples and qualitatively check the difference\n",
+    "We generate 3 bootstrap samples and qualitatively check the difference\n",
     "with the original dataset."
    ]
   },
@@ -176,6 +173,7 @@
     "    data_bootstrap, target_bootstrap = bootstrap_sample(\n",
     "        data_train,\n",
     "        target_train,\n",
+    "        seed=bootstrap_idx,  # ensure bootstrap samples are different but reproducible\n",
     "    )\n",
     "    plt.figure()\n",
     "    plt.scatter(\n",
@@ -242,9 +240,9 @@
    "source": [
     "\n",
     "On average, roughly 63.2% of the original data points of the original dataset\n",
-    "will be present in a given bootstrap sample. Since the bootstrap sample has\n",
-    "the same size as the original dataset, there will be many samples that are in\n",
-    "the bootstrap sample multiple times.\n",
+    "are present in a given bootstrap sample. Since the bootstrap sample has the\n",
+    "same size as the original dataset, there are many samples that are in the\n",
+    "bootstrap sample multiple times.\n",
     "\n",
     "Using bootstrap we are able to generate many datasets, all slightly different.\n",
     "We can fit a decision tree for each of these datasets and they all shall be\n",
@@ -262,7 +260,7 @@
     "    tree = DecisionTreeRegressor(max_depth=3, random_state=0)\n",
     "\n",
     "    data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample(\n",
-    "        data_train, target_train\n",
+    "        data_train, target_train, seed=bootstrap_idx\n",
     "    )\n",
     "    tree.fit(data_bootstrap_sample, target_bootstrap_sample)\n",
     "    bag_of_trees.append(tree)"
@@ -307,7 +305,7 @@
    "source": [
     "## Aggregating\n",
     "\n",
-    "Once our trees are fitted, we are able to get predictions for each of them. In\n",
+    "Once our trees are fitted, we are able to get predictions from each of them. In\n",
     "regression, the most straightforward way to combine those predictions is just\n",
     "to average them: for a given test data point, we feed the input feature values\n",
     "to each of the `n` trained models in the ensemble and as a result compute `n`\n",
@@ -355,7 +353,7 @@
    "metadata": {},
    "source": [
     "\n",
-    "The unbroken red line shows the averaged predictions, which would be the final\n",
+    "The continuous red line shows the averaged predictions, which would be the final\n",
     "predictions given by our 'bag' of decision tree regressors. Note that the\n",
     "predictions of the ensemble is more stable because of the averaging operation.\n",
     "As a result, the bag of trees as a whole is less likely to overfit than the\n",
@@ -407,7 +405,7 @@
     "bagged_trees_predictions = bagged_trees.predict(data_test)\n",
     "plt.plot(data_test[\"Feature\"], bagged_trees_predictions)\n",
     "\n",
-    "_ = plt.title(\"Predictions from a bagging classifier\")"
+    "_ = plt.title(\"Predictions from a bagging regressor\")"
    ]
   },
   {
@@ -461,15 +459,14 @@
    "metadata": {},
    "source": [
     "We used a low value of the opacity parameter `alpha` to better appreciate the\n",
-    "overlap in the prediction functions of the individual trees.\n",
-    "\n",
-    "This visualization gives some insights on the uncertainty in the predictions\n",
-    "in different areas of the feature space.\n",
+    "overlap in the prediction functions of the individual trees. Such a\n",
+    "visualization also gives us an intuition about the variance in the predictions\n",
+    "across different zones of the feature space.\n",
     "\n",
     "## Bagging complex pipelines\n",
     "\n",
-    "While we used a decision tree as a base model, nothing prevents us of using\n",
-    "any other type of model.\n",
+    "Although we used a decision tree as a base model here, nothing prevents us from\n",
+    "using any other type of model.\n",
     "\n",
     "As we know that the original data generating function is a noisy polynomial\n",
     "transformation of the input variable, let us try to fit a bagged polynomial\n",
@@ -490,7 +487,7 @@
     "\n",
     "polynomial_regressor = make_pipeline(\n",
     "    MinMaxScaler(),\n",
-    "    PolynomialFeatures(degree=4),\n",
+    "    PolynomialFeatures(degree=4, include_bias=False),\n",
     "    Ridge(alpha=1e-10),\n",
     ")"
    ]
@@ -499,10 +496,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This pipeline first scales the data to the 0-1 range with `MinMaxScaler`. Then\n",
-    "it extracts degree-4 polynomial features. The resulting features will all stay\n",
-    "in the 0-1 range by construction: if `x` lies in the 0-1 range then `x ** n`\n",
-    "also lies in the 0-1 range for any value of `n`.\n",
+    "This pipeline first scales the data to the 0-1 range using `MinMaxScaler`. It\n",
+    "then generates degree-4 polynomial features. By design, these features remain\n",
+    "in the 0-1 range, as any power of `x` within this range also stays within 0-1.\n",
     "\n",
     "Then the pipeline feeds the resulting non-linear features to a regularized\n",
     "linear regression model for the final prediction of the target variable.\n",
diff --git a/notebooks/ensemble_gradient_boosting.ipynb b/notebooks/ensemble_gradient_boosting.ipynb
index 6bbb6d517..84e636d56 100644
--- a/notebooks/ensemble_gradient_boosting.ipynb
+++ b/notebooks/ensemble_gradient_boosting.ipynb
@@ -4,21 +4,24 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Gradient-boosting decision tree (GBDT)\n",
+    "# Gradient-boosting decision tree\n",
     "\n",
-    "In this notebook, we will present the gradient boosting decision tree\n",
-    "algorithm and contrast it with AdaBoost.\n",
+    "In this notebook, we present the gradient boosting decision tree (GBDT) algorithm.\n",
     "\n",
-    "Gradient-boosting differs from AdaBoost due to the following reason: instead\n",
-    "of assigning weights to specific samples, GBDT will fit a decision tree on the\n",
-    "residuals error (hence the name \"gradient\") of the previous tree. Therefore,\n",
-    "each new tree in the ensemble predicts the error made by the previous learner\n",
-    "instead of predicting the target directly.\n",
+    "Even though AdaBoost and GBDT are both boosting algorithms, they are different in\n",
+    "nature: the former assigns weights to specific samples, whereas GBDT fits\n",
+    "successive decision trees on the residual errors (hence the name \"gradient\") of\n",
+    "their preceding tree. Therefore, each new tree in the ensemble tries to refine\n",
+    "its predictions by specifically addressing the errors made by the previous\n",
+    "learner, instead of predicting the target directly.\n",
     "\n",
-    "In this section, we will provide some intuition about the way learners are\n",
-    "combined to give the final prediction. In this regard, let's go back to our\n",
-    "regression problem which is more intuitive for demonstrating the underlying\n",
-    "machinery."
+    "In this section, we provide some intuitions on the way learners are combined\n",
+    "to give the final prediction. For this purpose, we tackle a single-feature\n",
+    "regression problem, which is more intuitive for demonstrating the underlying\n",
+    "machinery.\n",
+    "\n",
+    "Later in this notebook we compare the performance of GBDT (boosting) with that\n",
+    "of a Random Forest (bagging) for a particular dataset."
    ]
   },
   {
@@ -30,17 +33,14 @@
     "import pandas as pd\n",
     "import numpy as np\n",
     "\n",
-    "# Create a random number generator that will be used to set the randomness\n",
-    "rng = np.random.RandomState(0)\n",
-    "\n",
     "\n",
     "def generate_data(n_samples=50):\n",
     "    \"\"\"Generate synthetic dataset. Returns `data_train`, `data_test`,\n",
     "    `target_train`.\"\"\"\n",
     "    x_max, x_min = 1.4, -1.4\n",
-    "    len_x = x_max - x_min\n",
-    "    x = rng.rand(n_samples) * len_x - len_x / 2\n",
-    "    noise = rng.randn(n_samples) * 0.3\n",
+    "    rng = np.random.default_rng(0)  # Create a random number generator\n",
+    "    x = rng.uniform(x_min, x_max, size=(n_samples,))\n",
+    "    noise = rng.normal(size=(n_samples,)) * 0.3\n",
     "    y = x**3 - 0.5 * x**2 + noise\n",
     "\n",
     "    data_train = pd.DataFrame(x, columns=[\"Feature\"])\n",
@@ -74,9 +74,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As we previously discussed, boosting will be based on assembling a sequence of\n",
-    "learners. We will start by creating a decision tree regressor. We will set the\n",
-    "depth of the tree so that the resulting learner will underfit the data."
+    "As we previously discussed, boosting is based on assembling a sequence of\n",
+    "learners. We start by creating a decision tree regressor. We set the depth of\n",
+    "the tree to underfit the data on purpose."
    ]
   },
   {
@@ -96,12 +96,17 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "source": [
-    "Using the term \"test\" here refers to data that was not used for training. It\n",
-    "should not be confused with data coming from a train-test split, as it was\n",
-    "generated in equally-spaced intervals for the visual evaluation of the\n",
-    "predictions."
+    "Using the term \"test\" here refers to data not used for training. It should not\n",
+    "be confused with data coming from a train-test split, as it was generated in\n",
+    "equally-spaced intervals for the visual evaluation of the predictions.\n",
+    "\n",
+    "To avoid writing the same code in multiple places we define a helper function\n",
+    "to plot the data samples as well as the decision tree predictions and\n",
+    "residuals."
    ]
   },
   {
@@ -110,23 +115,42 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# plot the data\n",
-    "sns.scatterplot(\n",
-    "    x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n",
-    ")\n",
-    "# plot the predictions\n",
-    "line_predictions = plt.plot(data_test[\"Feature\"], target_test_predicted, \"--\")\n",
+    "def plot_decision_tree_with_residuals(y_train, y_train_pred, y_test_pred):\n",
+    "    \"\"\"Plot the synthetic data, predictions, and residuals for a decision tree.\n",
+    "    Handles are returned to allow custom legends for the plot.\"\"\"\n",
+    "    _fig_, ax = plt.subplots()\n",
+    "    # plot the data\n",
+    "    sns.scatterplot(\n",
+    "        x=data_train[\"Feature\"], y=y_train, color=\"black\", alpha=0.5, ax=ax\n",
+    "    )\n",
+    "    # plot the predictions\n",
+    "    line_predictions = ax.plot(data_test[\"Feature\"], y_test_pred, \"--\")\n",
     "\n",
-    "# plot the residuals\n",
-    "for value, true, predicted in zip(\n",
-    "    data_train[\"Feature\"], target_train, target_train_predicted\n",
-    "):\n",
-    "    lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n",
+    "    # plot the residuals\n",
+    "    for value, true, predicted in zip(\n",
+    "        data_train[\"Feature\"], y_train, y_train_pred\n",
+    "    ):\n",
+    "        lines_residuals = ax.plot(\n",
+    "            [value, value], [true, predicted], color=\"red\"\n",
+    "        )\n",
     "\n",
-    "plt.legend(\n",
-    "    [line_predictions[0], lines_residuals[0]], [\"Fitted tree\", \"Residuals\"]\n",
+    "    handles = [line_predictions[0], lines_residuals[0]]\n",
+    "\n",
+    "    return handles, ax"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "handles, ax = plot_decision_tree_with_residuals(\n",
+    "    target_train, target_train_predicted, target_test_predicted\n",
     ")\n",
-    "_ = plt.title(\"Prediction function together \\nwith errors on the training set\")"
+    "legend_labels = [\"Initial decision tree\", \"Initial residuals\"]\n",
+    "ax.legend(handles, legend_labels, bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
+    "_ = ax.set_title(\"Decision Tree together \\nwith errors on the training set\")"
    ]
   },
   {
@@ -139,15 +163,15 @@
     "for all the residual lines.</p>\n",
     "</div>\n",
     "Since the tree underfits the data, its accuracy is far from perfect on the\n",
-    "training data. We can observe this in the figure by looking at the difference\n",
-    "between the predictions and the ground-truth data. We represent these errors,\n",
-    "called \"Residuals\", by unbroken red lines.\n",
+    "training data. We can observe this in the figure above by looking at the\n",
+    "difference between the predictions and the ground-truth data. We represent\n",
+    "these errors, called \"residuals\", using solid red lines.\n",
     "\n",
-    "Indeed, our initial tree was not expressive enough to handle the complexity of\n",
+    "Indeed, our initial tree is not expressive enough to handle the complexity of\n",
     "the data, as shown by the residuals. In a gradient-boosting algorithm, the\n",
-    "idea is to create a second tree which, given the same data `data`, will try to\n",
-    "predict the residuals instead of the vector `target`. We would therefore have\n",
-    "a tree that is able to predict the errors made by the initial tree.\n",
+    "idea is to create a second tree which, given the same `data`, tries to predict\n",
+    "the residuals instead of the vector `target`, i.e. we have a second tree that\n",
+    "is able to predict the errors made by the initial tree.\n",
     "\n",
     "Let's train such a tree."
    ]
@@ -173,33 +197,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.scatterplot(x=data_train[\"Feature\"], y=residuals, color=\"black\", alpha=0.5)\n",
-    "line_predictions = plt.plot(\n",
-    "    data_test[\"Feature\"], target_test_predicted_residuals, \"--\"\n",
+    "handles, ax = plot_decision_tree_with_residuals(\n",
+    "    residuals,\n",
+    "    target_train_predicted_residuals,\n",
+    "    target_test_predicted_residuals,\n",
     ")\n",
-    "\n",
-    "# plot the residuals of the predicted residuals\n",
-    "for value, true, predicted in zip(\n",
-    "    data_train[\"Feature\"], residuals, target_train_predicted_residuals\n",
-    "):\n",
-    "    lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n",
-    "\n",
-    "plt.legend(\n",
-    "    [line_predictions[0], lines_residuals[0]],\n",
-    "    [\"Fitted tree\", \"Residuals\"],\n",
-    "    bbox_to_anchor=(1.05, 0.8),\n",
-    "    loc=\"upper left\",\n",
-    ")\n",
-    "_ = plt.title(\"Prediction of the previous residuals\")"
+    "legend_labels = [\n",
+    "    \"Predicted residuals\",\n",
+    "    \"Residuals of the\\npredicted residuals\",\n",
+    "]\n",
+    "ax.legend(handles, legend_labels, bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
+    "_ = ax.set_title(\"Prediction of the initial residuals\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We see that this new tree only manages to fit some of the residuals. We will\n",
-    "focus on a specific sample from the training set (i.e. we know that the sample\n",
-    "will be well predicted using two successive trees). We will use this sample to\n",
+    "We see that this new tree only manages to fit some of the residuals. We now\n",
+    "focus on a specific sample from the training set (as we know that the sample\n",
+    "can be well predicted using two successive trees). We use this sample to\n",
     "explain how the predictions of both trees are combined. Let's first select\n",
     "this sample in `data_train`."
    ]
@@ -210,19 +227,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sample = data_train.iloc[[-2]]\n",
+    "sample = data_train.iloc[[-7]]\n",
     "x_sample = sample[\"Feature\"].iloc[0]\n",
-    "target_true = target_train.iloc[-2]\n",
-    "target_true_residual = residuals.iloc[-2]"
+    "target_true = target_train.iloc[-7]\n",
+    "target_true_residual = residuals.iloc[-7]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's plot the previous information and highlight our sample of interest.\n",
-    "Let's start by plotting the original data and the prediction of the first\n",
-    "decision tree."
+    "Let's plot the original data, the predictions of the initial decision tree and\n",
+    "highlight our sample of interest. This is just a zoomed-in view of the plot\n",
+    "displaying the initial shallow tree."
    ]
   },
   {
@@ -231,35 +248,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Plot the previous information:\n",
-    "#   * the dataset\n",
-    "#   * the predictions\n",
-    "#   * the residuals\n",
-    "\n",
-    "sns.scatterplot(\n",
-    "    x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n",
+    "handles, ax = plot_decision_tree_with_residuals(\n",
+    "    target_train, target_train_predicted, target_test_predicted\n",
     ")\n",
-    "plt.plot(data_test[\"Feature\"], target_test_predicted, \"--\")\n",
-    "for value, true, predicted in zip(\n",
-    "    data_train[\"Feature\"], target_train, target_train_predicted\n",
-    "):\n",
-    "    lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n",
-    "\n",
-    "# Highlight the sample of interest\n",
-    "plt.scatter(\n",
+    "ax.scatter(\n",
     "    sample, target_true, label=\"Sample of interest\", color=\"tab:orange\", s=200\n",
     ")\n",
-    "plt.xlim([-1, 0])\n",
-    "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
-    "_ = plt.title(\"Tree predictions\")"
+    "ax.set_xlim([-1, 0])\n",
+    "ax.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
+    "_ = ax.set_title(\"Zoom of sample of interest\\nin the initial decision tree\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now, let's plot the residuals information. We will plot the residuals computed\n",
-    "from the first decision tree and show the residual predictions."
+    "Similarly, we zoom in on the plot showing the prediction of the initial residuals."
    ]
   },
   {
@@ -268,19 +272,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Plot the previous information:\n",
-    "#   * the residuals committed by the first tree\n",
-    "#   * the residual predictions\n",
-    "#   * the residuals of the residual predictions\n",
-    "\n",
-    "sns.scatterplot(x=data_train[\"Feature\"], y=residuals, color=\"black\", alpha=0.5)\n",
-    "plt.plot(data_test[\"Feature\"], target_test_predicted_residuals, \"--\")\n",
-    "for value, true, predicted in zip(\n",
-    "    data_train[\"Feature\"], residuals, target_train_predicted_residuals\n",
-    "):\n",
-    "    lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n",
-    "\n",
-    "# Highlight the sample of interest\n",
+    "handles, ax = plot_decision_tree_with_residuals(\n",
+    "    residuals,\n",
+    "    target_train_predicted_residuals,\n",
+    "    target_test_predicted_residuals,\n",
+    ")\n",
     "plt.scatter(\n",
     "    sample,\n",
     "    target_true_residual,\n",
@@ -288,9 +284,13 @@
     "    color=\"tab:orange\",\n",
     "    s=200,\n",
     ")\n",
-    "plt.xlim([-1, 0])\n",
-    "plt.legend()\n",
-    "_ = plt.title(\"Prediction of the residuals\")"
+    "legend_labels = [\n",
+    "    \"Predicted residuals\",\n",
+    "    \"Residuals of the\\npredicted residuals\",\n",
+    "]\n",
+    "ax.set_xlim([-1, 0])\n",
+    "ax.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n",
+    "_ = ax.set_title(\"Zoom of sample of interest\\nin the initial residuals\")"
    ]
   },
   {
@@ -299,7 +299,7 @@
    "source": [
     "For our sample of interest, our initial tree is making an error (small\n",
     "residual). When fitting the second tree, the residual in this case is\n",
-    "perfectly fitted and predicted. We will quantitatively check this prediction\n",
+    "perfectly fitted and predicted. We can quantitatively check this prediction\n",
     "using the fitted tree. First, let's check the prediction of the initial tree\n",
     "and compare it with the true value."
    ]
@@ -376,7 +376,9 @@
     "second tree corrects the first tree's error, while the third tree corrects the\n",
     "second tree's error and so on).\n",
     "\n",
-    "We will compare the generalization performance of random-forest and gradient\n",
+    "## First comparison of GBDT vs. random forests\n",
+    "\n",
+    "We now compare the generalization performance of random-forest and gradient\n",
     "boosting on the California housing dataset."
    ]
   },
@@ -467,14 +469,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In term of computation performance, the forest can be parallelized and will\n",
+    "In terms of computing performance, the forest can be parallelized and thus\n",
     "benefit from using multiple cores of the CPU. In terms of scoring performance,\n",
     "both algorithms lead to very close results.\n",
     "\n",
-    "However, we see that the gradient boosting is a very fast algorithm to predict\n",
-    "compared to random forest. This is due to the fact that gradient boosting uses\n",
-    "shallow trees. We will go into details in the next notebook about the\n",
-    "hyperparameters to consider when optimizing ensemble methods."
+    "However, we see that gradient boosting is overall faster than random forest.\n",
+    "One of the reasons is that random forests typically rely on deep trees (that\n",
+    "overfit individually) whereas boosting models build shallow trees (that\n",
+    "underfit individually) which are faster to fit and predict. In the following\n",
+    "exercise we will explore more in depth how these two models compare."
    ]
   }
  ],
diff --git a/notebooks/ensemble_sol_02.ipynb b/notebooks/ensemble_sol_02.ipynb
index 81e1feeec..91d8a5204 100644
--- a/notebooks/ensemble_sol_02.ipynb
+++ b/notebooks/ensemble_sol_02.ipynb
@@ -153,6 +153,21 @@
     "plt.plot(data_range[feature_name], forest_predictions, label=\"Random forest\")\n",
     "_ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "source": [
+    "The random forest reduces the overfitting of the individual trees but still\n",
+    "overfits itself. In the section on \"hyperparameter tuning with ensemble\n",
+    "methods\" we will see how to further mitigate this effect. Still, interested\n",
+    "users may increase the number of estimators in the forest and try different\n",
+    "values of, e.g., `min_samples_split`."
+   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/linear_models_ex_02.ipynb b/notebooks/linear_models_ex_02.ipynb
index 89bae664e..6a4fb4b15 100644
--- a/notebooks/linear_models_ex_02.ipynb
+++ b/notebooks/linear_models_ex_02.ipynb
@@ -55,7 +55,7 @@
     "\n",
     "data = penguins_non_missing[columns]\n",
     "target = penguins_non_missing[target_name]\n",
-    "data.head()"
+    "data"
    ]
   },
   {
diff --git a/notebooks/linear_models_ex_04.ipynb b/notebooks/linear_models_ex_04.ipynb
index 5d40693d7..91ba9bad6 100644
--- a/notebooks/linear_models_ex_04.ipynb
+++ b/notebooks/linear_models_ex_04.ipynb
@@ -11,7 +11,7 @@
     "In the previous Module we tuned the hyperparameter `C` of the logistic\n",
     "regression without mentioning that it controls the regularization strength.\n",
     "Later, on the slides on \ud83c\udfa5 **Intuitions on regularized linear models** we\n",
-    "metioned that a small `C` provides a more regularized model, whereas a\n",
+    "mentioned that a small `C` provides a more regularized model, whereas a\n",
     "non-regularized model is obtained with an infinitely large value of `C`.\n",
     "Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`\n",
     "model.\n",
diff --git a/notebooks/linear_models_feature_engineering_classification.ipynb b/notebooks/linear_models_feature_engineering_classification.ipynb
index 87544be19..44ac85951 100644
--- a/notebooks/linear_models_feature_engineering_classification.ipynb
+++ b/notebooks/linear_models_feature_engineering_classification.ipynb
@@ -490,10 +490,10 @@
    "metadata": {},
    "source": [
     "\n",
-    "The polynomial kernel approach would be interesting in cases were the\n",
+    "The polynomial kernel approach would be interesting in cases where the\n",
     "original feature space is already of high dimension: in these cases,\n",
     "**computing the complete polynomial expansion** with `PolynomialFeatures`\n",
-    "could be **intractable**, while Nystr\u00f6m method can control the output\n",
+    "could be **intractable**, while the Nystr\u00f6m method can control the output\n",
     "dimensionality with the `n_components` parameter.\n",
     "\n",
     "Let's now explore the use of a radial basis function (RBF) kernel:"
diff --git a/notebooks/linear_models_sol_02.ipynb b/notebooks/linear_models_sol_02.ipynb
index e124537d1..697f5ea08 100644
--- a/notebooks/linear_models_sol_02.ipynb
+++ b/notebooks/linear_models_sol_02.ipynb
@@ -55,7 +55,7 @@
     "\n",
     "data = penguins_non_missing[columns]\n",
     "target = penguins_non_missing[target_name]\n",
-    "data.head()"
+    "data"
    ]
   },
   {
diff --git a/notebooks/linear_models_sol_04.ipynb b/notebooks/linear_models_sol_04.ipynb
index 54b7a613e..79d130284 100644
--- a/notebooks/linear_models_sol_04.ipynb
+++ b/notebooks/linear_models_sol_04.ipynb
@@ -11,7 +11,7 @@
     "In the previous Module we tuned the hyperparameter `C` of the logistic\n",
     "regression without mentioning that it controls the regularization strength.\n",
     "Later, on the slides on \ud83c\udfa5 **Intuitions on regularized linear models** we\n",
-    "metioned that a small `C` provides a more regularized model, whereas a\n",
+    "mentioned that a small `C` provides a more regularized model, whereas a\n",
     "non-regularized model is obtained with an infinitely large value of `C`.\n",
     "Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`\n",
     "model.\n",
diff --git a/notebooks/linear_regression_non_linear_link.ipynb b/notebooks/linear_regression_non_linear_link.ipynb
index 9a060a2c0..27f6004b0 100644
--- a/notebooks/linear_regression_non_linear_link.ipynb
+++ b/notebooks/linear_regression_non_linear_link.ipynb
@@ -46,7 +46,7 @@
    "source": [
     "<div class=\"admonition tip alert alert-warning\">\n",
     "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
-    "<p class=\"last\"><tt class=\"docutils literal\">np.random.RandomState</tt> allows to create a random number generator which can\n",
+    "<p class=\"last\"><tt class=\"docutils literal\">np.random.RandomState</tt> allows creating a random number generator which can\n",
     "be later used to get deterministic results.</p>\n",
     "</div>\n",
     "\n",
@@ -284,8 +284,15 @@
     "by the `LinearRegression`. We can verify that this procedure is equivalent to\n",
     "creating the features by hand up to numerical error by computing the maximum\n",
     "of the absolute values of the differences between the features generated by\n",
-    "both methods and checking that it is close to zero:\n",
-    "\n",
+    "both methods and checking that it is close to zero:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "np.abs(polynomial_expansion.fit_transform(data) - data_expanded).max()"
    ]
   },
diff --git a/notebooks/linear_regression_without_sklearn.ipynb b/notebooks/linear_regression_without_sklearn.ipynb
index 039e1014a..6308e2d50 100644
--- a/notebooks/linear_regression_without_sklearn.ipynb
+++ b/notebooks/linear_regression_without_sklearn.ipynb
@@ -31,7 +31,7 @@
     "import pandas as pd\n",
     "\n",
     "penguins = pd.read_csv(\"../datasets/penguins_regression.csv\")\n",
-    "penguins.head()"
+    "penguins"
    ]
   },
   {
diff --git a/notebooks/metrics_classification.ipynb b/notebooks/metrics_classification.ipynb
index 5ecbabc0f..6ba0579e4 100644
--- a/notebooks/metrics_classification.ipynb
+++ b/notebooks/metrics_classification.ipynb
@@ -121,7 +121,7 @@
     "predictions a classifier can provide.\n",
     "\n",
     "For this reason, we will create a synthetic sample for a new potential donor:\n",
-    "they donated blood twice in the past (1000 c.c. each time). The last time was\n",
+    "they donated blood twice in the past (1000 cm\u00b3 each time). The last time was\n",
     "6 months ago, and the first time goes back to 20 months ago."
    ]
   },
@@ -664,4 +664,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/notebooks/metrics_regression.ipynb b/notebooks/metrics_regression.ipynb
index cf9199f37..1ce168093 100644
--- a/notebooks/metrics_regression.ipynb
+++ b/notebooks/metrics_regression.ipynb
@@ -150,8 +150,9 @@
    "source": [
     "The $R^2$ score represents the proportion of variance of the target that is\n",
     "explained by the independent variables in the model. The best score possible\n",
-    "is 1 but there is no lower bound. However, a model that predicts the expected\n",
-    "value of the target would get a score of 0."
+    "is 1 but there is no lower bound. However, a model that predicts the [expected\n",
+    "value](https://en.wikipedia.org/wiki/Expected_value) of the target would get a\n",
+    "score of 0."
    ]
   },
   {
diff --git a/notebooks/parameter_tuning_grid_search.ipynb b/notebooks/parameter_tuning_grid_search.ipynb
index cdf8117cc..3b6da1f83 100644
--- a/notebooks/parameter_tuning_grid_search.ipynb
+++ b/notebooks/parameter_tuning_grid_search.ipynb
@@ -64,7 +64,7 @@
    "outputs": [],
    "source": [
     "data = adult_census.drop(columns=[target_name, \"education-num\"])\n",
-    "data.head()"
+    "data"
    ]
   },
   {
@@ -325,7 +325,7 @@
     "cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(\n",
     "    \"mean_test_score\", ascending=False\n",
     ")\n",
-    "cv_results.head()"
+    "cv_results"
    ]
   },
   {
diff --git a/notebooks/parameter_tuning_manual.ipynb b/notebooks/parameter_tuning_manual.ipynb
index fcd4d99ca..38b4da2c7 100644
--- a/notebooks/parameter_tuning_manual.ipynb
+++ b/notebooks/parameter_tuning_manual.ipynb
@@ -52,7 +52,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data.head()"
+    "data"
    ]
   },
   {
diff --git a/notebooks/parameter_tuning_nested.ipynb b/notebooks/parameter_tuning_nested.ipynb
index 6fe297cdb..fb43c8145 100644
--- a/notebooks/parameter_tuning_nested.ipynb
+++ b/notebooks/parameter_tuning_nested.ipynb
@@ -294,7 +294,7 @@
     "</div>\n",
     "\n",
     "However, this evaluation only provides us a single point estimate of the\n",
-    "generalization performance. As recall at the beginning of this notebook, it is\n",
+    "generalization performance. As you recall from the beginning of this notebook, it is\n",
     "beneficial to have a rough idea of the uncertainty of our estimated\n",
     "generalization performance. Therefore, we should instead use an additional\n",
     "cross-validation for this evaluation.\n",
diff --git a/notebooks/parameter_tuning_randomized_search.ipynb b/notebooks/parameter_tuning_randomized_search.ipynb
index 3189e9301..94bc085dc 100644
--- a/notebooks/parameter_tuning_randomized_search.ipynb
+++ b/notebooks/parameter_tuning_randomized_search.ipynb
@@ -72,7 +72,7 @@
    "outputs": [],
    "source": [
     "data = adult_census.drop(columns=[target_name, \"education-num\"])\n",
-    "data.head()"
+    "data"
    ]
   },
   {
diff --git a/notebooks/trees_dataset.ipynb b/notebooks/trees_dataset.ipynb
index c2509a248..05e3b6f7f 100644
--- a/notebooks/trees_dataset.ipynb
+++ b/notebooks/trees_dataset.ipynb
@@ -62,7 +62,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "penguins.head()"
+    "penguins"
    ]
   },
   {