% Condensed Topic Notes.tex

\documentclass[12pt, titlepage]{article}

\usepackage[letterpaper,margin=2cm]{geometry}

\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{mathdots}
\usepackage{mathtools}
\usepackage[]{algorithm2e}
\usepackage{Commons}
\usepackage[T1]{fontenc}
\usepackage[sc]{mathpazo}
\usepackage{tikz}
\usepackage{bm}
\usepackage{bbm}

\input{math-boxes}

\title{ORF 526 Condensed Topic Notes}
\date{\today}


\begin{document}
\maketitle
\tableofcontents
\newpage

\section{Elementary Probability Theory}

\subsection{Conditional Probability}

\begin{dfn}[Conditional Probability]{}
\[\prob{A \mid B} = \frac{\prob{A \cap B}}{\prob{B}}\]
\end{dfn}

\begin{theo}[Bayes' Theorem]{}
\[\prob{A \mid B} = \frac{\prob{B \mid A} \prob{A}}{\prob{B}}\]
\end{theo}
\subsection{Statistical Independence}

\begin{dfn}[Statistical Independence]{}
Two events $A, B$ are statistically independent if
\[\prob{A \cap B} = \prob{A} \prob{B}\]
\end{dfn}


\subsection{Random Variables}

\begin{dfn}[Elementary Definition of Random Variables]{}
Given a sample space $\Omega$, a random variable is a numeric function on $\Omega$.
\end{dfn}

The distribution of random variables can be defined by a probability distribution function. This can take multiple forms.

\begin{dfn}[Cumulative Distribution Function]{}
\[F_X(x) = \prob{X \leq x}\]
\end{dfn}

\begin{dfn}[Probability Mass Function]{}
The distribution of discrete random variables can be defined by a function of the form

\[f_X(x) = \prob{X = x} = \prob{\{\omega \in \Omega \mid X(\omega) = x\}}\]
\end{dfn}

\begin{dfn}[Probability Density Function]{}
The distribution of a continuous random variable can be defined by a function $f_X$ where
\[\prob{a \leq X \leq b} = \int_a^b f_X(x)dx\]

Notice that this also defines the cumulative distribution function as
\[F_X(x) = \int_{-\infty}^x f_X(t)dt\]
\end{dfn}

\subsection{Expected Value}

\begin{dfn}[Expected Value]{}
The expected value of a discrete random variable $X$ is defined as
\[\expect{X} = \sum_{x} x \prob{X = x}\]

For a continuous random variable, we define it as
\[\expect{X} = \int_{-\infty}^\infty xf_X(x) dx\]
where $f_X$ is the probability density function of $X$.\\
\end{dfn}

\begin{info}[Properties of Expected Value]{}

\begin{itemize}
	\item Linearity: For $a, b \in \R$, $\expect{aX + bY} = a \expect{X} + b\expect{Y}$.
	\item $\expect{X}$ is finite if and only if $\expect{|X|}$ is finite.
	\item If $X \geq 0$ a.s., then $\expect{X} \geq 0$.
	\item If $X \leq Y$ a.s.\ and both $\expect{X}$ and $\expect{Y}$ exist (that is, $\min \{\expect{X_+}, \expect{X_-}\} < \infty$ and $\min\{\expect{Y_+}, \expect{Y_-}\} < \infty$), then $\expect{X} \leq \expect{Y}$.
	\item If $\expect{|X|^b} < \infty$ and $0 < a \leq b$, then $\expect{|X|^a} < \infty$.
	\item If $X, Y$ are independent random variables then $\expect{XY} = \expect{X}\expect{Y}$.
\end{itemize}
\end{info}

\subsection{Variance}

\begin{dfn}[Variance]{}
\[\sigma^2 = \Var(X) = \expect{(X - \expect{X})^2} = \expect{X^2} - \expect{X}^2\]
\end{dfn}

\begin{dfn}[Covariance]{}
\[\Cov(X, Y) = \expect{(X - \expect{X})(Y - \expect{Y})}\]
\end{dfn}

\begin{rmk}[Properties of Variance]{}
\begin{itemize}
	\item $\Var(X) \geq 0$.
	\item $\Var(aX) = a^2\Var(X)$.
	\item $\Var(aX + bY) = a^2\Var(X) + b^2\Var(Y) + 2ab\Cov(X, Y)$.
	
\end{itemize}
\end{rmk}

\subsection{Important Distributions}

\subsubsection{Bernoulli Distribution}

\begin{rmk}{}
The Bernoulli distribution is the distribution of the random variable that takes the value $1$ with probability $p$ and the value $0$ with probability $(1-p)$.
\end{rmk}

\begin{info}[Bernoulli Distribution Properties]{}
\begin{itemize}
	\item Expected Value: $\expect{X} = p$.
	\item Variance: $\Var(X) = p(1-p)$.
\end{itemize}
\end{info}

\subsubsection{Binomial Distribution}

\begin{rmk}{}
The binomial distribution is the probability distribution of the number of successes in a sequence of $n$ experiments, each with probability of success $p$. In other words, the sum of $n$ independent random variables each with a Bernoulli distribution and probability $p$. 
\end{rmk}

\begin{info}[Binomial Distribution Properties]{}
\begin{itemize}
	\item PMF: $\prob{X = k} = f(k, n, p) = \binom{n}{k} p^k (1-p)^{n-k}$.
	\item CDF: $\prob{X \leq k} = F(k, n, p) = \sum_{i = 0}^k \binom{n}{i}p^i(1-p)^{n-i}$.
	\item Expected Value: $\expect{X} = np$.
	\item Variance: $\Var(X) = np(1-p)$.
\end{itemize}
\end{info}

\subsubsection{Geometric Distribution}

\begin{rmk}{}
The geometric distribution describes the number of repeated Bernoulli trials (experiments with a probability of success $p$) needed to achieve one success. For example, how many times must a coin be flipped to get a heads. 
\end{rmk}

\begin{info}[Geometric Distribution Properties]{}
	\begin{itemize}
		\item PMF: $\prob{X = k} = f(k, p) = (1-p)^{k-1}p$.
		\item CDF: $\prob{X \leq k} = F(k, p) = 1 - (1 - p)^k$.
		\item Expected Value: $\expect{X} = \frac{1}{p}$.
		\item Variance: $\Var(X) = \frac{1-p}{p^2}$.
	\end{itemize}
\end{info}

\subsubsection{Poisson Distribution}

\begin{rmk}{}
	The Poisson distribution describes the probability that a given number of events will occur in a fixed length of time if the events occur at a constant rate and the chance of one occurring is independent of the time since the last event. For example, the number of meteors that strike the earth in one year. $\lambda$ denotes the average number of events in a unit time.
\end{rmk}

\begin{info}[Poisson Distribution Properties]{}
	\begin{itemize}
		\item PMF: $\prob{X = k} = f(k, \lambda) = e^{-\lambda} \frac{\lambda^k}{k!}$.
		\item CDF: $\prob{X \leq k} = F(k, \lambda) = e^{-\lambda} \sum_{i=0}^k \frac{\lambda^i}{i!}$.
		\item Expected Value: $\expect{X} = \lambda$.
		\item Variance: $\Var(X) = \lambda$.
	\end{itemize}
\end{info}

\subsubsection{Exponential Distribution}

\begin{rmk}{}
	The exponential distribution describes the time between events in a Poisson point process. That is, a process in which events occur continuously and independently at a constant average rate. $\lambda$ denotes the average number of events in a unit time.
\end{rmk}

\begin{info}[Exponential Distribution Properties]{}
	\begin{itemize}
		\item PDF: $f(x, \lambda) = \lambda e^{-\lambda x}$ for $x \geq 0$ (and $0$ for $x < 0$).
		\item CDF: $\prob{X \leq x} = F(x, \lambda) = \int_{-\infty}^x f(t, \lambda) dt = 1 - e^{-\lambda x}$ for $x \geq 0$.
		\item Expected Value: $\expect{X} = \frac{1}{\lambda}$.
		\item Variance: $\Var(X) = \frac{1}{\lambda^2}$.
	\end{itemize}
\end{info}

\subsubsection{Normal Distribution}

\begin{rmk}{}
	The normal distribution is a continuous probability distribution with a number of unique properties. It is particularly important due to its relation to the central limit theorem.
\end{rmk}

\begin{info}[Normal Distribution Properties]{}
	\begin{itemize}
		\item PDF: $f(x, \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x - \mu)^2}{2 \sigma^2}}$.
		\item CDF: $\prob{X \leq x} = F(x, \mu, \sigma^2) = \int_{-\infty}^x f(t, \mu, \sigma^2) dt = \frac{1}{2} \left[1 + \operatorname{erf}\left(\frac{x - \mu}{\sigma \sqrt{2}}\right)\right]$, where $\operatorname{erf}$, the error function, is non-elementary.
		\item Expected Value: $\expect{X} = \mu$.
		\item Variance: $\Var(X) = \sigma^2$.
	\end{itemize}
\end{info}

\subsection{Important Inequality Theorems}

\begin{theo}[Markov's Inequality]{}
Suppose $X$ is a non-negative random variable and $a > 0$. Then
\[\prob{X \geq a} \leq \frac{\expect{X}}{a}\]
\end{theo}

\begin{theo}[Chebyshev's Inequality]{}
Let $X$ be a random variable with finite mean $\expect{X} = \mu$ and finite non-zero variance $\Var(X) = \sigma^2$. Then for any real $k > 0$,
\[\prob{|X - \mu| \geq k\sigma} \leq \frac{1}{k^2}\]
\end{theo}

\subsection{Important Limit Theorems}

\begin{theo}[Weak Law of Large Numbers]{}
Let $\{X_i\}$ be i.i.d. random variables with mean $\mu$, and let $S_n = \frac{X_1 + X_2 + \dots + X_n}{n}$. Then $S_n$ converges to $\mu$ in probability. That is, for any $\epsilon > 0$,
\[\lim_{n \to \infty} \prob{|S_n - \mu| > \epsilon} = 0\]

That is, for any $\epsilon > 0$, we can make the probability that $S_n$ is within $\epsilon$ of $\mu$ arbitrarily close to $1$ by taking $n$ sufficiently large.
\end{theo}

\begin{theo}[Strong Law of Large Numbers]{}
Let $\{X_i\}$ be i.i.d. random variables with mean $\mu$, and let $S_n = \frac{X_1 + X_2 + \dots + X_n}{n}$. Then $S_n$ converges to $\mu$ almost surely. That is, 
\[\prob{\lim_{n \to \infty} S_n = \mu} = 1\]
\end{theo}

\begin{theo}[Central Limit Theorem]{}
Let $\{X_i\}$ be i.i.d. random variables with mean $\mu$ and finite variance $\sigma^2$, and let $S_n = \frac{X_1 + X_2 + \dots + X_n}{n}$. Then
\[\sqrt{n}(S_n - \mu) \to \mathcal{N}(0, \sigma^2)\]
as $n \to \infty$. That is, as $n \to \infty$, the cumulative distribution function of $\sqrt{n}(S_n - \mu)$ converges pointwise to the CDF of the normal distribution centered at $0$ with variance $\sigma^2$.
\end{theo}

\section{Measure Theory}

\subsection{($\sigma$) Algebras}

\begin{dfn}[($\sigma$) Algebras]{}
	A collection $C$ of subsets of $E$ (the universe) is called an algebra if:
	\begin{itemize}
		\item $\emptyset \in C$.
		\item $A \in C \implies A^c \in C$.
		\item $A, B \in C \implies A \cup B \in C$.
	\end{itemize}
	$C$ is a $\sigma$-algebra if:
	\begin{itemize}
		\item $\emptyset \in C$.
		\item $A \in C \implies A^c \in C$.
		\item $A_1, A_2, \dots \in C \implies \bigcup^\infty_{i = 1}A_i \in C$.
	\end{itemize}
	Notice the strengthened version of property three.
\end{dfn}

Examples:
\begin{itemize}
	\item $\mathcal{E} = \{\emptyset, E\}$, the trivial $\sigma$-algebra.
	\item $\mathcal{E} = 2^E$, the discrete $\sigma$-algebra.
\end{itemize}

\begin{theo}[Intersections and Unions of $\sigma$-Algebras]{}
\begin{itemize}
	\item Any (countable or uncountable) intersection of $\sigma$-algebras is a $\sigma$-algebra.
	\item The union of two $\sigma$-algebras is not necessarily a $\sigma$-algebra.\\
\end{itemize}
\end{theo}

\begin{dfn}[Generated $\sigma$-algebra]{}
	Let $\mathcal{C}$ be a collection of subsets of $E$. Take all $\sigma$-algebras that contain $\mathcal{C}$. Take their intersection. This $\sigma$-algebra is called the $\sigma$-algebra generated by $\mathcal{C}$, and is denoted by $\sigma(\mathcal{C})$.
\end{dfn}

\begin{dfn}[Borel Algebras and Borel Sets]{}
	If $E$ is a topological space, and $\mathcal{C}$ is the collection of all open sets of $E$, then $\sigma (\mathcal{C})$ is called the Borel $\sigma$-algebra. Its elements are called Borel sets. The Borel $\sigma$-algebra is denoted by $\mathcal{B}_E$ or $\mathcal{B}(E)$.
\end{dfn}

\begin{dfn}[Measurable Spaces and Measurable Sets]{}
	A pair $(E, \mathcal{E})$ is a measurable space if $\mathcal{E}$ is a $\sigma$-algebra on $E$. The sets in $\mathcal{E}$ are called measurable sets.
\end{dfn}

\begin{dfn}[Measurable Rectangles]{}
	Let $(E, \mathcal{E})$, $(F, \mathcal{F})$ be two measurable spaces. If $A \in \mathcal{E}$ and $B \in \mathcal{F}$ are measurable sets, then $A \times B$ is called a measurable rectangle.\\
\end{dfn}

\begin{dfn}[Products of Measurable Spaces]{}
	The product $(E \times F, \mathcal{E} \otimes \mathcal{F})$, where $\mathcal{E} \otimes \mathcal{F} = \sigma(\{A \times B \mid A \in \mathcal{E}, B \in \mathcal{F}\})$, is a measurable space.\\
\end{dfn}

\subsection{Measures}

\begin{dfn}[Measure]{}
	$\mu: \mathcal{E} \to \overline{\R}_+ = [0, \infty]$ is a measure on $(E, \mathcal{E})$ if
	\begin{enumerate}
		\item $\mu(\emptyset) = 0$
		\item If $A_1, A_2, \dots \in \mathcal{E}$ are pairwise disjoint, then $\mu(\bigcup^\infty_{i = 1} A_i) = \sum^\infty_{i = 1}\mu(A_i)$
	\end{enumerate}
\end{dfn}
Property (2) is called countable additivity or $\sigma$-additivity.\\

\begin{info}[Properties of Measures]{}
Let $(E, \mathcal{E}, \mu)$ be a measure space with measure $\mu$. Then $\mu$ has the following properties:\\

\begin{enumerate}
	\item Finite Additivity. $A \cap B = \emptyset \implies \mu (A \cup B) = \mu(A) + \mu(B)$.
	\item  Monotonicity. If $A \subseteq B$ then $\mu(A) \leq \mu(B)$. Note that this is clear because $B = A \cup (B \setminus A)$.
	\item  Sequential Continuity. If $A_1 \subseteq A_2 \subseteq \dots$ is an increasing sequence in $\mathcal{E}$ with $\bigcup_{n = 1}^\infty A_n = A$, then $\mu(A_n)$ converges to $\mu(A)$ from below.
	\item Boole's Inequality / Union Bound. $A_1, A_2, \dots \in \mathcal{E}$, $\mu(\bigcup^\infty_{i = 1} A_i) \leq \sum_{i = 1}^\infty \mu(A_i)$. Note: We can prove this by creating a sequence of disjoint subsets of $\bigcup_{i = 1}^\infty A_i$, then using sequential continuity.
	\item If $c > 0$, then $c\mu$ is also a measure, $(c\mu)(A) = c \cdot \mu(A)$.
	\item If $\mu_1$, $\mu_2$ are measures then $\mu_1 + \mu_2$ is a measure.
\end{enumerate}
\end{info}

\subsubsection{Examples of Measures}
\begin{enumerate}
	\item The Dirac Measure. $x \in E$, $\delta_x(A) =
	\begin{cases}
	1 & x \in A\\
	0 & x \not \in A
	\end{cases}$
	\item The Counting Measure. $D \subset E$, $\mu(A) = $\# of points in $A \cap D$. If $D$ is countable, then $\mu(A) = \sum_{x \in D} \delta_x(A)$.
	\item Discrete Measure. $D \subset E$ countable, $m(x)$ is some non-negative real value for every $x \in D$. $\mu(A) = \sum_{x \in D} m(x)\delta_x(A)$.
	\item The Uniform Measure on $\{1, 2, \dots, n\}$. The discrete measure with $m(x) = \frac{1}{n}$.
	\item The Lebesgue Measure. $Leb(A) =$ length of $A$ where $A$ is an interval.
\end{enumerate}

\begin{dfn}[Finite Measure]{}
	If $\mu(E) < \infty$ then $\mu$ is called a finite measure.
\end{dfn}

\begin{dfn}[$\sigma$-Finite Measure]{}
We say that a measure $\mu$ is $\sigma$-finite if there exists a measurable countable partition $\{E_n\}$ of $E$ such that $\mu(E_n) < \infty$ for all $n$. Ex: $Leb$ is $\sigma$-finite.
\end{dfn}

For example, the Lebesgue measure is $\sigma$-finite but not finite.

\begin{dfn}[Probability Measure]{}
	A probability measure is a measure $\mu$ such that $\mu(E) = 1$.
\end{dfn}

\begin{dfn}[Probability Space]{}
	A probability space is a triple $(E, \mathcal{E}, \mu)$ such that $(E, \mathcal{E})$ is a measurable space, and $\mu$ is a probability measure.
\end{dfn}

Probability spaces are often denoted by the triple $(\Omega, \mathcal{F}, \mathbb{P})$.

\subsection{Specification of Measures}

\begin{theo}{}
	Let $(E, \mathcal{E})$ be a measurable space. Let $\mu$ and $\nu$ be two measures on $(E, \mathcal{E})$ with $\mu(E) = \nu(E) < \infty$. If $\mu$ and $\nu$ agree on a collection of subsets that is closed under intersections and generates $\mathcal{E}$, then $\mu = \nu$.
\end{theo}

\begin{theo}[Corollary]{}
	Cor: If $\mu$ and $\nu$ are two probability measures on $\mathbb{R}$ with the same cumulative distribution functions, then $\mu = \nu$.
\end{theo}

\begin{theo}[Cumulative Distribution]{}
	The cumulative distribution function of a measure $\mu$ on $\mathbb{R}$, evaluated at a point $x$, is $\mu((-\infty, x])$.
\end{theo}

Assume that $\{x\} \in \mathcal{E}$ if $x \in E$. This is true of all standard measurable spaces.\\

\begin{dfn}[Atom]{}
	$x$ is an atom of $\mu$ if $\mu(\{x\}) > 0$.
\end{dfn}

\begin{dfn}[Purely Atomic Measures]{}
	$\mu$ is purely atomic if there exists a countable set $D \subset E$ such that $\forall x \in D$, $\mu(\{x\}) > 0$ and $\mu(E \setminus D) = 0$.
\end{dfn}

\begin{dfn}[Diffuse Measures]{}
	$\mu$ is diffuse if it has no atoms. Ex: $Leb$.
\end{dfn}

\begin{lem}{}
	If $\mu$ is a $\sigma$-finite measure on $(E, \mathcal{E})$ then we can write $\mu = \lambda + \nu$ where $\lambda$ is diffuse and $\nu$ is purely atomic.
\end{lem}

\subsection{Completeness and Negligible Sets}

\begin{dfn}[Negligible Set]{}
	A measurable set $A$ is negligible if $\mu(A) = 0$. An arbitrary subset of $E$ is negligible if it is contained in a measurable set that is negligible.
\end{dfn}

\begin{dfn}[Complete Measure Space]{}
	A measure space is complete if every negligible set is measurable.
\end{dfn}

\begin{lem}[]{}
	To make a measure space complete, take $\overline{\mathcal{E}} = \sigma(\mathcal{E} \cup \mathcal{N})$, where $\mathcal{N}$ is the collection of negligible sets. Every $A \in \overline{\mathcal{E}}$ can be written as $A = B \cup N$ with $B \in \mathcal{E}, N \in \mathcal{N}$. Define $\overline{\mu}(A) = \mu(B)$. This is called the completion of the measure space, $(E, \overline{\mathcal{E}}, \overline{\mu})$. In the case of $(\mathbb{R}, \mathcal{B}_{\mathbb{R}}, Leb)$, the elements of $\overline{\mathcal{B}}_\mathbb{R}$ are called Lebesgue-measurable.\\
\end{lem}

\subsection{Measurable Functions}

\begin{dfn}[Measurable Functions]{}
	Let $(E, \mathcal{E})$ and $(F, \mathcal{F})$ be two measurable spaces. $f: E \to F$ is measurable relative to $\mathcal{E}$ and $\mathcal{F}$ if $f^{-1}(A) \in \mathcal{E}$ for every $A \in \mathcal{F}$.
\end{dfn}

\begin{theo}[]{}
	Let $(E, \mathcal{E})$, $(F, \mathcal{F})$ be measurable spaces. $f: E \to F$ is measurable relative to $\mathcal{E}$ and $\mathcal{F}$ if and only if there exists a collection $\mathcal{F}_0$ of subsets of $F$ such that $f^{-1}(B) \in \mathcal{E}$ $\forall B \in \mathcal{F}_0$, and $\mathcal{F}_0$ generates $\mathcal{F}$.
\end{theo}

\begin{prf}[]{}
	Left as an exercise
\end{prf}

\begin{theo}[]{}
	Let $(E, \mathcal{E})$, $(F, \mathcal{F})$, $(G, \mathcal{G})$ be measurable spaces, and let $f: E \to F$, $g: F \to G$. If $f$ and $g$ are measurable, then $g \circ f$ is measurable.
\end{theo}

\begin{dfn}[]{}
	$f: E \to \R$ is $\mathcal{E}$-measurable if it is measurable relative to $\mathcal{E}$ and $\mathcal{B}_\R$.\\
\end{dfn}

\begin{dfn}[Borel Function]{}
	If $E$ is a topological space and $\mathcal{E}$ is the Borel $\sigma$-algebra, then an $\mathcal{E}$-measurable function $f$ is simply called a Borel function.
\end{dfn}

\begin{lem}[]{}
	$f: E \to \R$ is $\mathcal{E}$-measurable, if and only if $f^{-1}((-\infty, r]) \in \mathcal{E}$ for all $r \in \R$.\\
\end{lem}

\begin{prf}[]{}
	From HW1: $\sigma(\{(-\infty, r] \mid r \in \R\}) = \mathcal{B}(\R)$. The claim then follows from the earlier theorem stating that measurability only needs to be checked on the inverse images of a generating collection.\\
\end{prf}

\begin{dfn}[]{}
	$f^+:= \max\{f, 0\}$, $f^-:= -\min\{f, 0\}$. Note that $f = f^+ - f^-$.\\
\end{dfn}

\begin{lem}[]{}
	$f$ is $\mathcal{E}$-measurable if and only if $f^+$ and $f^-$ are $\mathcal{E}$-measurable.
\end{lem}

\begin{prf}[]{}
	Left as an exercise
\end{prf}

\begin{dfn}[Indicator Function]{}
	An indicator function is of the form
	\[\mathbbm{1}_A(x) =
	\begin{cases}
	1 & x \in A\\
	0 & x \not \in A
	\end{cases}\]
\end{dfn}

Check: $\mathbbm{1}_A$ is $\mathcal{E}$-measurable if and only if $A \in \mathcal{E}$.\\

\begin{dfn}[Simple Function]{}
	A function $f$ is simple if $f = \sum_{i = 1}^n a_i \mathbbm{1}_{A_i}$, where $a_i \in \R$ and $A_1, A_2, \dots, A_n \in \mathcal{E}$ are measurable sets.\\
\end{dfn}

\begin{dfn}[Canonical Form of a Simple Function]{}
	The canonical form of a simple function is $f = \sum_{j = 1}^m b_j \mathbbm{1}_{B_j}$ where the $b_j$ are distinct and $\{B_j\}$ is a measurable partition of $E$.\\
\end{dfn}

\begin{info}[]{}
	Conversely, if a function is $\mathcal{E}$-measurable and takes only finitely many real values, then it is a simple function.
\end{info}

\begin{info}[]{}
	If $f$ and $g$ are simple, then so are $f+g$, $f-g$, $fg$, $f/g$ (provided $g$ is nowhere zero), $\max\{f, g\}$, $\min\{f, g\}$.\\
\end{info}

\begin{theo}[]{}
	The class of measurable functions is closed under limits.\\
	
	Let $\{f_n\}$ be a sequence of $\mathcal{E}$-measurable functions then $\inf f_n$, $\sup f_n$, $\liminf f_n$, and $\limsup f_n$, defined pointwise, are $\mathcal{E}$-measurable.
\end{theo}

\begin{prf}[]{}
	For $\sup f_n = f$, we want to show that $f^{-1}(-\infty, r] \in \mathcal{E}$. Since intersections can be rewritten as unions, and $f(x) \leq r \iff f_n(x) \leq r$ $\forall n$, we have
	
	\[f^{-1}(-\infty, r] = \bigcap^\infty_{n=1} f^{-1}_n(-\infty, r]\]
	
	But we know that $f^{-1}_n(-\infty, r] \in \mathcal{E}$ and since this is a countable intersection, $f^{-1}(-\infty, r] \in \mathcal{E}$.
\end{prf}

Let $f: E \to \overline{\R}_+$ be a positive measurable function, and define $d_n: \overline{\R}_+ \to \overline{\R}_+$ by $d_n(x) = \sum_{k = 1}^{n2^n} \frac{k-1}{2^n} \mathbbm{1}_{[\frac{k-1}{2^n}, \frac{k}{2^n})}(x) + n \mathbbm{1}_{[n, \infty]}(x)$. We can approximate $f$ by a sequence of simple functions $f_n = d_n \circ f$.\\

\begin{theo}[]{}
	A positive function $f$ is $\mathcal{E}$-measurable if and only if it is the limit of an increasing sequence of positive simple functions.\\
\end{theo}

\subsection{Integration of Measurable Functions}

Suppose $(E, \mathcal{E}, \mu)$ is a measure space. Let $f: E \to \R$. We want to find $\int f \, d\mu$. That is, the integral of $f$ relative to the measure $\mu$. We denote this $\mu f = \mu(f) = \int f \, d\mu = \int \mu(dx)f(x) = \int_E \mu(dx) f(x)$.\\

We will first define integrals over measure spaces for simple functions, then extend this definition by taking limits.\\

\begin{dfn}[Integral of a Measurable Function Relative to a Measure]{}
	If $f$ is a simple function, $f = \sum_{i = 1}^n a_i \mathbbm{1}_{A_i}$, where $\{A_i\}$ is a partition of $E$ we define the integral as
	
	\[\int f d\mu = \sum_{i=1}^n a_i \mu(A_i)\]
	
	Now, suppose that $f$ is a measurable positive function, and let $f_n = d_n \circ f$. Then $\int f d\mu = \lim_{n \to \infty} \int f_n d\mu$.\\
	
	Finally, if $f = f^+ - f^-$, then $\int f d\mu = \int f^+ d\mu - \int f^- d\mu$, provided that at least one of the two integrals on the right is finite. Otherwise, $\int f d\mu$ is undefined.
\end{dfn}


\begin{dfn}[Integrable Relative to a Measure]{}
	$f$ is integrable if $\int f d\mu$ exists and is finite.\\
\end{dfn}

\begin{theo}[]{}
$f$ is integrable $\iff$ $\int |f| d\mu < \infty$. Notice $|f| = f^+ + f^-$.\\
\end{theo}

\begin{dfn}[Almost Everywhere]{}
	A statement holds almost everywhere (for almost every $x \in E$) if it holds for all $x$ except for $x$ in a negligible set. Denoted $\mu$-a.e.\ or (a.e.). For probability measures, we say ``almost surely''.
\end{dfn}

\begin{info}[Properties of Measure Integrals]{}
$a, b \in \R^+$, $f, g \in \mathcal{E}_+$ ($\mathcal{E}$-measurable positive functions).

\begin{enumerate}
	\item Positivity: $\mu(f) \geq 0$. $\mu(f) = 0 \implies f = 0$ a.e.
	\item Linearity: $\mu(af + bg) = a\mu(f) + b\mu(g)$.
	\item Monotonicity: If $f \leq g$ a.e., then $\mu(f) \leq \mu(g)$.
\end{enumerate}
\end{info}

\begin{theo}[Monotone Convergence Theorem]{}
If $(f_n) \subset \mathcal{E}_+$ and $f_n \to f$ from below, then $\mu(f_n) \to \mu(f)$ from below.\\

Examples of integrals with respect to specific measures:
\begin{enumerate}
	\item Dirac measure: using the Dirac delta $\delta_{x_0} (f) = f(x_0)$.\\
	\item $\mu = \sum_{x \in D} m(x)\delta_x$, $D \subset E$, then $\mu(f) = \sum_{x \in D} m(x) f(x)$. Note that if $E$ is countable, then every measure is of this form ($m(x) = \mu(\{x\})$).
\end{enumerate}

Note that if $E$ is a vector space, we can think of $\mu(f)$ as the inner product $\langle \mu, f \rangle$.\\
\end{theo}


\begin{theo}[]{}
If the Riemann integral of $f$ exists, then the Lebesgue integral of $f$ exists as well and the two are equal. However, the converse is false. Notice that if $E = [0, 1]$, $f = \mathbbm{1}_\Q$, then $Leb(f) = 0$, even though $f$ is not Riemann integrable.\\
\end{theo}

Suppose that $A \subset E$, $A \in \mathcal{E}$, $f \in \mathcal{E}$. Then $f \mathbbm{1}_A \in \mathcal{E}$ and so $\mu(f \mathbbm{1}_A) = \int f \mathbbm{1}_A d\mu = \int_A f d\mu$.

\begin{theo}[]{}
	Let $\{f_n\}$ be a monotone increasing sequence of measurable positive functions. Then $\mu(\lim_{n \to \infty} f_n) = \lim_{n \to \infty} \mu(f_n)$.
\end{theo}

\begin{prf}[]{}
	$f := \lim f_n$ is well defined, so $\mu(f)$ is well defined. For all $n$, $f_n \leq f$, so $\mu(f_n) \leq \mu(f)$ and $\lim_n \mu(f_n) \leq \mu(f)$.\\
	
	For the other direction, we want to show that $\lim_{n \to \infty} \mu(f_n) \geq \mu(d_k \circ f)$ $\forall k$.\\
\end{prf}

\begin{lem}[]{}
	If $A \in \mathcal{E}$ is negligible, then $\int_A f d\mu = 0$ for all measurable $f$.\\
	
	If $f = g$ a.e., then $\mu(f) = \mu(g)$.\\
	
	If $f \in \mathcal{E}_+$, $\mu(f) = 0$ then $f = 0$ a.e.
\end{lem}

\begin{theo}[Fatou's Lemma]{}
	Let $(f_n)_{n \geq 1}$ be a sequence of functions in $\mathcal{E}_+$. Then $\mu(\liminf f_n) \leq \liminf \mu(f_n)$. This follows from MCT (HW).
\end{theo}

\begin{theo}[Dominated Convergence Theorem]{}
	If $f_n$ is a sequence of measurable functions and there exists a function $g$ such that (a) $|f_n| \leq g$ $\forall n \geq 1$, and (b) $g$ is integrable, then $f:= \lim f_n$ (if it exists) is integrable and $\mu(f) = \lim \mu(f_n)$. This follows from Fatou's lemma (HW).\\
	
	Terminology: $g$ dominates $f_n$ for every $n$.
\end{theo}

\begin{theo}[Corollary: Bounded Convergence Theorem]{}
	Suppose that $\mu$ is a finite measure, and $|f_n| \leq c < \infty$ ($c$ a constant), and $f:= \lim f_n$ exists. Then $\mu(f) = \lim \mu(f_n)$.
\end{theo}

\begin{theo}[]{}
	Let $(E, \mathcal{E})$ be a measurable space and $f$ a measurable function. $L: \mathcal{E}_+ \to \overline{\R}_+$. Then there exists a unique measure $\mu$ on $(E, \mathcal{E})$ such that $L(f) = \mu(f)$ if and only if
	
	\begin{itemize}
		\item $f = 0 \implies L(f) = 0$.
		\item $L(af + bg) = aL(f) + bL(g)$.
		\item If $f_n \to f$ from below, then $L(f_n) \to L(f)$ from below.
	\end{itemize}
\end{theo}

\begin{info}[Products of Measure Spaces]{}
	\begin{itemize}
		\item $(E, \mathcal{E})$, $(F, \mathcal{F})$: $(E \times F, \mathcal{E} \otimes \mathcal{F})$.
		\item $(E, \mathcal{E}, \mu)$, $(F, \mathcal{F}, \nu)$: $(E \times F, \mathcal{E} \otimes \mathcal{F}, \mu \times \nu)$.
		\item $(\mu \times \nu)(A \times B) = \mu(A) \nu(B)$.
	\end{itemize}
\end{info}

\begin{theo}[Fubini]{}
	Suppose that $f: E \times F \to \overline{\R}$ is measurable and such that $\int \int_{E \times F} |f| d(\mu \times \nu) < \infty$. Then $\int \int_{E \times F} f d(\mu \times \nu) = \int_F (\int_E f(x, y) \mu(dx)) \nu(dy) = \int_E (\int_F f(x, y) \nu(dy)) \mu(dx)$.\\
\end{theo}

\begin{theo}[Tonelli]{}
	If $f \geq 0$ then the same conclusions hold.
\end{theo}

\subsection{Absolute Continuity of Measures}

\begin{dfn}[Absolute Continuity of Measures]{}
	Let $(E, \mathcal{E})$ be a measurable space with measures $\mu$ and $\nu$. We say that $\mu$ is absolutely continuous with regard to $\nu$ if $\forall A \in \mathcal{E}$, $\nu(A) = 0 \implies \mu(A) = 0$. Denote this by $\mu \ll \nu$.
\end{dfn}

Example: If a measure on $\R$ has a density (e.g., $\mu(dx) = \frac{1}{\sqrt{2\pi}} e^{-\frac{x^2}{2}} dx$, the standard Gaussian measure), then it is absolutely continuous with regard to the Lebesgue measure.\\

Example: Discrete distributions with the same support are absolutely continuous with regard to one another.\\

\begin{theo}[]{}
	Suppose that $\mu$ is $\sigma$-finite, and that $\nu \ll \mu$. Then there exists a positive $\mathcal{E}$-measurable function $p$ such that $\int_E \nu(dx) f(x) = \int_E \mu(dx) p(x)f(x)$, $\forall f \in \mathcal{E}_+$.\\
	
	Moreover, $p$ is unique up to equivalence (if this holds for $p'$, then $p = p'$ a.e.).\\
\end{theo}

\begin{dfn}[Radon-Nikodym Derivative]{}
	This function $p$ is called the Radon-Nikodym derivative of $\nu$ with regard to $\mu$. We write this as $p(x) = \frac{\nu(dx)}{\mu(dx)}(x)$, or $p = \frac{d\nu}{d\mu}$.\\
\end{dfn}

Suppose we care about $\nu$, but it is difficult to work with directly. If $\nu \ll \mu$, then we can perform calculations using the nicer $\mu$ together with the density $p$.\\

\begin{dfn}[Singular Measures]{}
	$\mu$ is singular with regard to $\nu$ if there exists some set $D \in \mathcal{E}$ such that $\mu(D) = 0$ and $\nu(E \setminus D) = 0$.
\end{dfn}

\subsection{Products of Measure Spaces}
A product of measure spaces 

\[\bigotimes_{i = 1}^n (E_i, \mathcal{E}_i, \mu_i)\]

can be seen as $n$ mutually independent random variables (e.g., $n$ coin tosses).\\

How do we define a countably infinite product of measure spaces ($\bigotimes_{i = 1}^\infty (E_i, \mathcal{E}_i, \mu_i)$)?\\

Let $\mathcal{R}$ be the collection of all finite-dimensional measurable rectangles. That is, all sets of the form $\{ x \mid x_1 \in B_1, \dots, x_n \in B_n, x_{n + 1} \in \R, x_{n + 2} \in \R, \dots \}$ where $n \in \N$ and $B_i \in \mathcal{B}(\R)$. Then $\mathcal{B}_C = \sigma(\mathcal{R})$. We define the measure as

\[\mu(\{x \mid x_1 \in B_1, \dots, x_n \in B_n, \dots\}) = \mu_1(B_1)\mu_2(B_2)\dots\mu_n(B_n)\] 

\begin{theo}[Kolmogorov's Extension Theorem]{}
	Suppose $\{\mu_n\}_{n \geq 1}$ is a sequence of probability measures, where $\mu_n$ is a probability measure on $(\R^n, \mathcal{B}_{\R^n})$, and that the sequence is consistent. That is,
	
	\[\mu_{n + 1}(\{x_1 \in B_1, x_2 \in B_2, \dots, x_n \in B_n, x_{n+1} \in \R\}) = \mu_n(\{x_1 \in B_1, x_2 \in B_2, \dots, x_n \in B_n \})\]
	
	for all $n \in \N$ and all $B_1, B_2, \dots, B_n \in \mathcal{B}(\R)$. Then there exists a unique probability measure $\mathbb{P}$ on $(\R^\N, \mathcal{B}_C)$ such that $\mathbb{P}(\{w \mid w_1 \in B_1, \dots, w_n \in B_n\}) = \mu_n(B_1 \times B_2 \times \dots \times B_n)$.
\end{theo}

\section{Probability Spaces}

%Treating it as an integral
\subsection{Expected Value}

\subsection{Almost Sure and Almost Everywhere}

\subsection{Inequalities and Bounds}

\subsection{Borel-Cantelli Lemmas}

\subsection{Law of Large Numbers, Central Limit Theorem}

\subsection{Weak Convergence}

\section{Markov Chains}

\end{document}