Skewness and Kurtosis#
Central Moments#
In modern data analysis we are sometimes interested in high-order moments. Here we consider two useful quantities: skewness and kurtosis.
(Central Moments)
For a random variable \(X\) with PDF \(f_X(x)\), define the following central moments as
\[\begin{split}
\begin{aligned}
\text { mean } & =\mathbb{E}[X] \stackrel{\text { def }}{=} \mu, \\
\text { variance } & =\mathbb{E}\left[(X-\mu)^2\right] \stackrel{\text { def }}{=} \sigma^2, \\
\text { skewness } & =\mathbb{E}\left[\left(\frac{X-\mu}{\sigma}\right)^3\right] \stackrel{\text { def }}{=} \gamma, \\
\text { kurtosis } & =\mathbb{E}\left[\left(\frac{X-\mu}{\sigma}\right)^4\right] \stackrel{\text { def }}{=} \kappa, \quad \text { excess kurtosis } \stackrel{\text { def }}{=} \kappa-3 .
\end{aligned}
\end{split}\]
As you can see from the definitions above, skewness is the third standardized central moment, whereas kurtosis is the fourth standardized central moment: each is a central moment of \(X\) divided by the corresponding power of \(\sigma\). Both skewness and kurtosis can be regarded as “deviations” from a standard Gaussian — not in terms of mean and variance but in terms of shape [Chan, 2021].
Skewness#
1import numpy as np
2import matplotlib.pyplot as plt
3from scipy import stats
4from typing import List
5
def generate_skewed_data(size: int, skewness: float) -> np.ndarray:
    """Generate skewed data using a beta distribution.

    Only the sign of ``skewness`` is used; its magnitude has no effect.

    Args:
        size: Number of samples to draw.
        skewness: Positive selects Beta(2, 5), negative selects Beta(5, 2),
            and zero selects the symmetric Beta(5, 5).

    Returns:
        Array of ``size`` samples drawn with ``np.random.beta`` (NumPy's
        global random state, so ``np.random.seed`` controls the result).
    """
    if skewness > 0:
        a, b = 2, 5
    elif skewness < 0:
        a, b = 5, 2
    else:
        a = b = 5
    return np.random.beta(a, b, size)
15
def plot_distributions(data: List[np.ndarray], labels: List[str]) -> None:
    """Plot each sample as a histogram plus a Gaussian-KDE curve on one figure.

    Args:
        data: Samples to compare, one array per distribution. The KDE curve
            is evaluated on the fixed interval [0, 1].
        labels: Legend text for each sample, in the same order as ``data``.

    Each legend entry also reports the sample skewness computed by
    ``scipy.stats.skew``. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(12, 6))
    colors = ['red', 'blue', 'green']
    # The evaluation grid is the same for every sample, so build it once.
    x_range = np.linspace(0, 1, 1000)

    for i, (d, label) in enumerate(zip(data, labels)):
        # Cycle through the palette so that a fourth (or later) sample is
        # still drawn instead of being silently dropped by zip().
        color = colors[i % len(colors)]
        skewness = stats.skew(d)
        kde = stats.gaussian_kde(d)

        plt.hist(d, bins=30, density=True, alpha=0.3, color=color, edgecolor='black')
        plt.plot(x_range, kde(x_range), color=color, label=f'{label} (Skewness: {skewness:.2f})')

    plt.title('Comparison of Skewed Distributions')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
36
def main() -> None:
    """Draw three beta samples of opposite/zero skew and plot them together."""
    np.random.seed(42)  # For reproducibility
    size = 10000

    # Only the sign of the second argument matters to generate_skewed_data.
    positive_skew = generate_skewed_data(size, 1)
    negative_skew = generate_skewed_data(size, -1)
    symmetric = generate_skewed_data(size, 0)

    data = [positive_skew, negative_skew, symmetric]
    labels = ['Positive Skewness', 'Negative Skewness', 'Symmetric (No Skewness)']

    plot_distributions(data, labels)
49
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Kurtosis#
1import numpy as np
2import matplotlib.pyplot as plt
3from scipy import stats
4from typing import List, Tuple
5
def generate_skewed_data(size: int, skewness: float) -> np.ndarray:
    """Generate skewed data using a beta distribution.

    Only the sign of ``skewness`` is used; its magnitude has no effect.

    Args:
        size: Number of samples to draw.
        skewness: Positive selects Beta(2, 5), negative selects Beta(5, 2),
            and zero selects the symmetric Beta(5, 5).

    Returns:
        Array of ``size`` samples drawn with ``np.random.beta`` (NumPy's
        global random state, so ``np.random.seed`` controls the result).
    """
    if skewness > 0:
        a, b = 2, 5
    elif skewness < 0:
        a, b = 5, 2
    else:
        a = b = 5
    return np.random.beta(a, b, size)
15
def generate_kurtosis_data(size: int, df: float) -> np.ndarray:
    """Generate data with different kurtosis using Student's t-distribution.

    Args:
        size: Number of samples to draw.
        df: Degrees of freedom passed to ``scipy.stats.t.rvs``.

    Returns:
        Array of ``size`` samples from the t-distribution with ``df``
        degrees of freedom.
    """
    return stats.t.rvs(df, size=size)
19
def plot_distributions(data: List[np.ndarray], labels: List[str], title: str, show_histogram: bool = True) -> None:
    """Plot the distributions on the same graph with KDE.

    Args:
        data: Samples to compare, one array per distribution.
        labels: Legend text for each sample, in the same order as ``data``.
        title: Title of the figure.
        show_histogram: When true, also draw a density histogram of each
            sample underneath its KDE curve.

    A new figure is created but not shown; the caller must call
    ``plt.show()``. Each legend entry reports the sample skewness and excess
    kurtosis.
    """
    plt.figure(figsize=(12, 6))
    colors = ['red', 'blue', 'green']

    for i, (d, label) in enumerate(zip(data, labels)):
        # Cycle through the palette so that a fourth (or later) sample is
        # still drawn instead of being silently dropped by zip().
        color = colors[i % len(colors)]
        skewness = stats.skew(d)
        # stats.kurtosis uses Fisher's definition by default, i.e. it returns
        # kurtosis minus 3 (excess kurtosis), so the legend says so.
        excess_kurtosis = stats.kurtosis(d)
        kde = stats.gaussian_kde(d)
        # Evaluate the KDE over the observed range of this sample.
        x_range = np.linspace(d.min(), d.max(), 1000)

        if show_histogram:
            plt.hist(d, bins=30, density=True, alpha=0.3, color=color, edgecolor='black')
        plt.plot(x_range, kde(x_range), color=color, label=f'{label}\n(Skewness: {skewness:.2f}, Excess kurtosis: {excess_kurtosis:.2f})')

    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
41
def main() -> None:
    """Build the skewness and kurtosis comparison figures and display them."""
    np.random.seed(42)  # For reproducibility
    size = 10000

    # Skewness plot
    positive_skew = generate_skewed_data(size, 1)
    negative_skew = generate_skewed_data(size, -1)
    symmetric = generate_skewed_data(size, 0)

    skew_data = [positive_skew, negative_skew, symmetric]
    skew_labels = ['Positive Skewness', 'Negative Skewness', 'Symmetric (No Skewness)']

    plot_distributions(skew_data, skew_labels, 'Comparison of Skewed Distributions')

    # Kurtosis plot
    leptokurtic = generate_kurtosis_data(size, df=5)  # High kurtosis (heavy tails)
    mesokurtic = generate_kurtosis_data(size, df=30)  # Approximately normal kurtosis
    platykurtic = np.random.uniform(-np.sqrt(3), np.sqrt(3), size)  # Low kurtosis

    kurtosis_data = [leptokurtic, mesokurtic, platykurtic]
    kurtosis_labels = ['Leptokurtic', 'Mesokurtic', 'Platykurtic']

    plot_distributions(kurtosis_data, kurtosis_labels, 'Comparison of Kurtosis', show_histogram=False)

    # plot_distributions only builds the figures; show both of them here.
    plt.show()
67
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
References and Further Readings#
Chan, Stanley H. “Chapter 4.6.3 Skewness and kurtosis.” In Introduction to Probability for Data Science, 216-220. Ann Arbor, Michigan: Michigan Publishing Services, 2021.