%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{3. Comparing distributions with the poweRlaw package}
\documentclass[11pt,BCOR2mm,DIV14]{scrartcl}
\usepackage{booktabs}% For fancy tables
\usepackage[round]{natbib}% References
\usepackage{textcomp}
\usepackage{amsmath,amsfonts}
\usepackage{scrlayer-scrpage}
\usepackage[utf8]{inputenc}% Unicode support
\usepackage{color,hyperref}
\urlstyle{rm}
\usepackage{xcolor}
\hypersetup{
  colorlinks,
  linkcolor={red!50!black},
  citecolor={blue!50!black},
  urlcolor={blue!80!black}
}

\pagestyle{scrheadings}
\setheadsepline{.4pt}
\ihead[]{Comparing distributions}
\chead[]{}
\ohead[]{}
\ifoot[]{}
\cfoot[]{}
\ofoot[\pagemark]{\pagemark}

\newcommand{\cc}{\texttt}
\newcommand{\xmin}{x_{\min}}

<<echo=FALSE, message=FALSE>>=
library(knitr)
library(poweRlaw)
options(replace.assign = FALSE, width = 50)

opts_chunk$set(fig.path = "knitr_figure_poweRlaw/graphicsc-",
               cache.path = "knitr_cache_poweRlaw_c/",
               fig.align = "center",
               dev = "pdf", fig.width = 5, fig.height = 5,
               fig.show = "hold", cache = FALSE, par = TRUE,
               out.width = "0.4\\textwidth")
knit_hooks$set(crop = hook_pdfcrop)
knit_hooks$set(par = function(before, options, envir) {
  if (before && options$fig.show != "none") {
    par(mar = c(3, 4, 2, 1), cex.lab = .95, cex.axis = .9,
        mgp = c(3, .7, 0), tcl = -.01, las = 1)
  }}, crop = hook_pdfcrop)
set.seed(1)
palette(c(rgb(170, 93, 152, maxColorValue = 255),
          rgb(103, 143, 57, maxColorValue = 255),
          rgb(196, 95, 46, maxColorValue = 255),
          rgb(79, 134, 165, maxColorValue = 255),
          rgb(205, 71, 103, maxColorValue = 255),
          rgb(203, 77, 202, maxColorValue = 255),
          rgb(115, 113, 206, maxColorValue = 255)))
@
%
\begin{document}
\date{Last updated: \today}
\title{The poweRlaw package: Comparing distributions}
\author{Colin S. Gillespie}
\maketitle

\begin{abstract}
  \noindent The \verb$poweRlaw$ package provides an easy-to-use interface for fitting
  and visualising heavy-tailed distributions, including power laws. This vignette
  provides examples of comparing competing distributions.
\end{abstract}

\section{Comparing distributions}

This short vignette aims to provide some guidance when comparing distributions using
Vuong's test statistic. The hypotheses being tested are
\[
H_0: \text{Both distributions are equally far from the true distribution}
\]
and
\[
H_1: \text{One of the test distributions is closer to the true distribution.}
\]
To perform this test we use the \cc{compare\_distributions()}
function\footnote{The \cc{compare\_distributions()} function also returns a one-sided
$p$-value. Essentially, the one-sided $p$-value tests whether the first model is better
than the second, i.e.\ a \textbf{one}-sided test.} and examine the \cc{p\_two\_sided}
value.
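
As a rough guide to what is being computed (this is a sketch of the idea behind Vuong's
statistic, not a description of the exact implementation in
\cc{compare\_distributions()}), let $r_i = \log p_1(x_i) - \log p_2(x_i)$ denote the
pointwise log-likelihood ratio of the two fitted models at each of the $n$ observations
with $x_i \ge \xmin$. The test statistic is then the normalised log-likelihood ratio
\[
T = \frac{\sum_{i=1}^{n} r_i}{\sqrt{n}\,\hat\sigma},
\qquad\text{where}\qquad
\hat\sigma^2 = \frac{1}{n}\sum_{i=1}^{n} \bigl(r_i - \bar{r}\bigr)^2 .
\]
Under $H_0$, $T$ is approximately standard normal, so values of $|T|$ far from zero
indicate that one of the two models fits better, and the two-sided $p$-value follows
directly.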

\section{Example: Simulated data 1}

First let's generate some data from a power-law distribution
<<>>=
library("poweRlaw")
set.seed(1)
x = rpldis(1000, xmin = 2, alpha = 3)
@
\noindent and fit a discrete power-law distribution
<<>>=
m1 = displ$new(x)
m1$setPars(estimate_pars(m1))
@
\noindent The estimated values of $\xmin$ and $\alpha$ are \Sexpr{m1$getXmin()} and
\Sexpr{signif(m1$getPars(), 3)}, respectively. As an alternative distribution, we will
fit a discrete Poisson distribution\footnote{When comparing distributions, each model
must have the same $\xmin$ value. In this example, both models have
$\xmin=\Sexpr{m1$getXmin()}$.}
<<>>=
m2 = dispois$new(x)
m2$setPars(estimate_pars(m2))
@
\noindent Plotting both models
<<plot1, fig.keep="none">>=
plot(m2, ylab = "CDF")
lines(m1)
lines(m2, col = 2, lty = 2)
grid()
@
\begin{figure}[t]
\centering
<<ref.label="plot1", echo=FALSE>>=
@
\caption{Plot of the simulated data CDF, with power-law and Poisson lines of best
fit.}\label{F1}
\end{figure}
\noindent suggests that the power-law model gives a better fit (figure~\ref{F1}).
Investigating this formally
<<>>=
comp = compare_distributions(m1, m2)
comp$p_two_sided
@
\noindent means we can reject $H_0$, since $p=\Sexpr{signif(comp$p_two_sided, 4)}$, and
conclude that one model is closer to the true distribution.

\subsection*{One- or two-sided $p$-value}

The two-sided $p$-value does not depend on the order of the model comparison
<<>>=
compare_distributions(m1, m2)$p_two_sided
compare_distributions(m2, m1)$p_two_sided
@
\noindent However, the one-sided $p$-value is order dependent
<<>>=
## We only care if m1 is better than m2
## m1 is clearly better
compare_distributions(m1, m2)$p_one_sided
## m2 isn't better than m1
compare_distributions(m2, m1)$p_one_sided
@

\section{Example: Moby Dick data set}

\noindent This time we will look at the Moby Dick data set
<<>>=
data("moby")
@
\noindent Again we fit a power law
<<>>=
m1 = displ$new(moby)
m1$setXmin(estimate_xmin(m1))
@
\noindent and a log-normal model\footnote{In order to compare distributions, $\xmin$
must be equal for both distributions.}
<<>>=
m2 = dislnorm$new(moby)
m2$setXmin(m1$getXmin())
m2$setPars(estimate_pars(m2))
@
\noindent Plotting the CDFs
<<plot2, fig.keep="none">>=
plot(m2, ylab = "CDF")
lines(m1)
lines(m2, col = 2, lty = 2)
grid()
@
\begin{figure}[t]
\centering
<<ref.label="plot2", echo=FALSE>>=
@
\caption{The Moby Dick data set with power-law and log-normal lines of best
fit.}\label{F2}
\end{figure}
\noindent suggests that both models perform equally well (figure~\ref{F2}). The formal
hypothesis test
<<>>=
comp = compare_distributions(m1, m2)
@
\noindent gives a $p$-value and test statistic of
<<>>=
comp$p_two_sided
comp$test_statistic
@
\noindent which means we cannot reject $H_0$. The $p$-value and test statistic are
similar to the values found in table 6.3 of \citet{clauset2009}.

<<echo=FALSE>>=
# R compiles all vignettes in the same session, which can be bad
rm(list = ls(all = TRUE))
@

\bibliography{poweRlaw}
\bibliographystyle{plainnat}
\end{document}