%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{2. Examples using the poweRlaw package}
\documentclass[11pt,BCOR2mm,DIV14]{scrartcl}
\usepackage{booktabs}% Fancy tables
\usepackage[round]{natbib}% References
\usepackage{amsmath,amsfonts}
\usepackage{scrlayer-scrpage}
\usepackage[utf8]{inputenc}% Unicode support
\usepackage{color,hyperref}
\urlstyle{rm}
\usepackage{xcolor}
\hypersetup{
  colorlinks,
  linkcolor={red!50!black},
  citecolor={blue!50!black},
  urlcolor={blue!80!black}
}
\pagestyle{scrheadings}
\setheadsepline{.4pt}
\ihead[]{Examples}
\chead[]{}
\ohead[]{}
\ifoot[]{}
\cfoot[]{}
\ofoot[\pagemark]{\pagemark}
\newcommand{\cc}{\texttt}
\newcommand{\xmin}{x_{\min}}
\newcommand{\ntail}{n_{\text{tail}}}
<<echo=FALSE, message=FALSE>>=
library(knitr)
library(poweRlaw)
options(replace.assign = FALSE)
opts_chunk$set(fig.path = "knitr_figure_poweRlaw/graphicsn-",
               cache.path = "knitr_cache_poweRlaw_b/",
               fig.align = "center",
               dev = "pdf", fig.width = 5, fig.height = 5,
               fig.show = "hold", cache = FALSE, par = TRUE,
               out.width = "0.4\\textwidth")
knit_hooks$set(crop = hook_pdfcrop)
knit_hooks$set(par = function(before, options, envir) {
  if (before && options$fig.show != "none") {
    par(mar = c(3, 4, 2, 1), cex.lab = .95, cex.axis = .9,
        mgp = c(3, .7, 0), tcl = -.01, las = 1)
  }}, crop = hook_pdfcrop)
set.seed(1)
palette(c(rgb(170, 93, 152, maxColorValue = 255),
          rgb(103, 143, 57, maxColorValue = 255),
          rgb(196, 95, 46, maxColorValue = 255),
          rgb(79, 134, 165, maxColorValue = 255),
          rgb(205, 71, 103, maxColorValue = 255),
          rgb(203, 77, 202, maxColorValue = 255),
          rgb(115, 113, 206, maxColorValue = 255)))
@
%
\begin{document}
\date{Last updated: \today}
\title{The poweRlaw package: Examples}
\author{Colin S. Gillespie}
\maketitle
<<echo=FALSE>>=
#if(!file.exists("blackouts.txt"))
blackouts = c(570, 210.882, 190, 46, 17, 360, 74, 19, 460, 65, 18.351, 25, 25, 63.5, 1, 9,
  50, 114.5, 350, 25, 50, 25, 242.91, 55, 164.5, 877, 43, 1140, 464, 90, 2100, 385,
  95.63, 166, 71, 100, 234, 300, 258, 130, 246, 114, 120, 24.506, 36.073, 10, 160, 600,
  12, 203, 50.462, 40, 59, 15, 1.646, 500, 60, 133, 62, 173, 81, 20, 112, 600,
  24, 37, 238, 50, 50, 147, 32, 40.911, 30.5, 14.273, 160, 230, 92, 75, 130, 124,
  120, 11, 235, 50, 94.285, 240, 870, 70, 50, 50, 18, 51, 51, 145, 557.354, 219,
  191, 2.9, 163, 257.718, 1660, 1600, 1300, 80, 500, 10, 290, 375, 95, 725, 128, 148,
  100, 2, 48, 18, 5.3, 32, 250, 45, 38.5, 126, 284, 70, 400, 207.2, 39.5, 363.476,
  113.2, 1500, 15, 7500, 8, 56, 88, 60, 29, 75, 80, 7.5, 82.5, 272, 272, 122,
  145, 660, 50, 92, 60, 173, 106.85, 25, 146, 158, 1500, 40, 100, 300, 1.8, 300,
  70, 70, 29, 18.819, 875, 100, 50, 1500, 650, 58, 142, 350, 71, 312, 25, 35,
  315, 500, 404, 246, 43.696, 71, 65, 29.9, 30, 20, 899, 10.3, 490, 115, 2085, 206,
  400, 26.334, 598, 160, 91, 33, 53, 300, 60, 55, 60, 66.005, 11.529, 56, 4.15, 40,
  320.831, 30.001, 200) * 1000
@
\section{Discrete data: The Moby Dick data set}

The Moby Dick data set contains the frequency of unique words in the novel
Moby Dick by Herman Melville. This data set was originally downloaded from
Clauset's website, but that link no longer resolves. Instead, the data can be
loaded directly from the package
<<>>=
library("poweRlaw")
data("moby", package = "poweRlaw")
@
\noindent To fit a discrete power law to this data\footnote{The object
\texttt{moby} is a simple R vector.}, we use the \texttt{displ} constructor
<<>>=
m_pl = displ$new(moby)
@
\noindent The resulting object, \cc{m\_pl}, is a
\cc{displ}\footnote{\cc{displ}: discrete power law.} object.
It also inherits the \cc{discrete\_distribution} class.
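Distribution objects in \cc{poweRlaw} are reference objects: their data, lower
bound and parameters can be queried and modified in place using getter and
setter methods. A brief illustrative sketch of the newly created object's
default state (not evaluated here):
<<eval=FALSE>>=
## Inspect the state of the newly created object
m_pl$getXmin()  # defaults to the smallest value in the data
m_pl$getPars()  # NULL until the parameters are estimated or set
@
\noindent Since the object is modified in place, the setter calls used below
do not need their result re-assigned.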
After creating the \cc{displ} object, a typical first step would be to infer
model parameters.\footnote{When the \cc{displ} object is first created, the
default parameter values are \cc{NULL} and $\xmin$ is set to the minimum
$x$-value.} We estimate the lower threshold via
<<>>=
est = estimate_xmin(m_pl)
@
\noindent and update the power law object
<<>>=
m_pl$setXmin(est)
@
\noindent For a given value $\xmin$, the scaling parameter is estimated by
numerically optimising the log-likelihood. The optimiser is
\textit{initialised} using the analytical MLE
\[
\hat \alpha \simeq 1 + n \left[\sum_{i=1}^n \log \left(\frac{x_i}{\xmin -0.5}\right)\right]^{-1} \;.
\]
\noindent This yields a threshold estimate of $\xmin=\Sexpr{est$xmin}$ and
scaling parameter $\alpha=\Sexpr{signif(est$pars, 3)}$, which matches results
found in \citet{clauset2009}. Alternatively, we could perform a parameter scan
for each value of $\xmin$
<<>>=
estimate_xmin(m_pl, pars = seq(1.8, 2.3, 0.1))
@
\noindent To fit a discrete log-normal distribution, we follow a similar
procedure, except we begin by creating a \cc{dislnorm}
object\footnote{\cc{dislnorm}: discrete log-normal object.}
<<eval=FALSE>>=
m_ln = dislnorm$new(moby)
est = estimate_xmin(m_ln)
@
<<echo=FALSE>>=
if (file.exists("examples1.rds")) {
  load("examples1.rds")
} else {
  m_ln = dislnorm$new(moby)
  est = estimate_xmin(m_ln)
  est_ln = est
  est_ln$pars = signif(est_ln$pars, 3)
  m_ln$setXmin(est)
  m_pois = dispois$new(moby)
  est = estimate_xmin(m_pois)
  m_pois$setXmin(est)
  save(m_pois, m_ln, est_ln, file = "examples1.rds")
}
@
\noindent which yields a lower threshold of $\xmin=\Sexpr{est_ln$xmin}$ and
parameters $(\Sexpr{est_ln$pars[1]},$ $\Sexpr{est_ln$pars[2]})$. A similar
procedure is applied to fit the Poisson distribution; we create a distribution
object using \cc{dispois}, then fit as before.

The data CDF and lines of best fit can be easily plotted
<<eval=FALSE>>=
plot(m_pl)
lines(m_pl, col = 2)
lines(m_ln, col = 3)
lines(m_pois, col = 4)
@
\begin{figure}[t]
\centering
<<echo=FALSE>>=
plot(m_pl, xlab = "x", ylab = "CDF",
     panel.first = grid(col = "grey80"),
     pch = 21, bg = 1)
lines(m_pl, col = 2, lwd = 2)
lines(m_ln, col = 3, lwd = 2)
lines(m_pois, col = 4, lwd = 2)
@
\caption{Data CDF of the Moby Dick data set. The fitted power law (green
line), log-normal (red line) and Poisson (blue line) distributions are also
given.}\label{F1a}
\end{figure}
\noindent to obtain figure \ref{F1a}. It is clear that the Poisson
distribution is not appropriate for this data set. However, the log-normal and
power law distributions both provide reasonable fits to the data.

\subsection{Parameter uncertainty}
<<echo=FALSE>>=
data(bootstrap_moby, package = "poweRlaw")
bs = bootstrap_moby
@
To get a handle on the uncertainty in the parameter estimates, we use a
bootstrapping procedure, via the \texttt{bootstrap} function. This procedure
can be applied to any distribution object.\footnote{For example,
\mbox{\cc{bootstrap(m\_ln)}}.} Furthermore, the bootstrap procedure can use
multiple CPU cores to speed up inference.\footnote{The output of this
bootstrapping procedure can be obtained via \cc{data(bootstrap\_moby)}.}
<<eval=FALSE>>=
## 5000 bootstraps using two cores
bs = bootstrap(m_pl, no_of_sims = 5000, threads = 2)
@
\noindent By default, the \cc{bootstrap} function will use the maximum
likelihood estimate to estimate the parameter and check all values of
$\xmin$. When the set of possible $\xmin$ values is large, it is recommended
that the search space be reduced.
For example, this function call
<<eval=FALSE>>=
bootstrap(m_pl, xmins = seq(2, 20, 2))
@
\noindent will only calculate the Kolmogorov-Smirnov statistics at values of
$\xmin$ equal to
\[
2, 4, 6, \ldots, 20\;.
\]
A similar argument exists for the parameters.\footnote{For single parameter
models, \cc{pars} should be a vector. For the log-normal distribution,
\cc{pars} should be a matrix of values.}

The \cc{bootstrap} function returns a \cc{bs\_xmin} object that has a number
of components:
\begin{enumerate}
\item The goodness-of-fit statistic obtained from the Kolmogorov-Smirnov test.
This value should correspond to the value obtained from the
\mbox{\cc{estimate\_xmin}} function.
\item A data frame containing the results of the bootstrap procedure.
\item The average simulation time, in seconds, for a single bootstrap.
\item The random number seed.
\item The package version.
\end{enumerate}
The bootstrap results can be explored in a variety of ways. First, we can
estimate the standard deviation of the parameter estimates, i.e.
\begin{figure}[t]
\centering
<<echo=FALSE>>=
plot(bs)
@
\caption{Results from the standard bootstrap procedure (for the power law
model) using the Moby Dick data set: \mbox{\texttt{bootstrap(m\_pl)}}. The top
row shows the mean estimate of parameters $\xmin$, $\alpha$ and $\ntail$. The
bottom row shows the estimate of standard deviation for each parameter. The
dashed lines give approximate 95\% confidence intervals. After 5,000
iterations, the standard deviation of $\xmin$ and $\alpha$ is estimated to be
2.1 and 0.03 respectively.}\label{F1b}
\end{figure}
<<>>=
sd(bs$bootstraps[, 2])
sd(bs$bootstraps[, 3])
@
\noindent Alternatively, we can visualise the results using the \cc{plot}
function
<<eval=FALSE>>=
## trim=0.1 only displays the final 90% of iterations
plot(bs, trim = 0.1)
@
\noindent to obtain figure \ref{F1b}. The top row of graphics in figure
\ref{F1b} gives a 95\% confidence interval for the mean estimate of the
parameters. The bottom row of graphics gives a 95\% confidence interval for
the standard deviation of the parameters. The parameter \texttt{trim} in the
\texttt{plot} function controls the percentage of samples
displayed.\footnote{When \cc{trim=0}, all iterations are displayed.} When
\texttt{trim=0.1}, we only display the final 90\% of iterations.
\begin{figure}[t]
\centering
<<echo=FALSE>>=
par(mfrow = c(1, 2))
hist(bs$bootstraps[, 2], xlab = expression(x[min]),
     ylim = c(0, 1600), xlim = c(0, 20),
     main = NULL, breaks = "fd")
grid()
hist(bs$bootstraps[, 3], xlab = expression(alpha),
     ylim = c(0, 500), xlim = c(1.80, 2.05),
     main = NULL, breaks = "fd")
grid()
@
\caption{Characterising uncertainty in parameter values. (a) $\xmin$
uncertainty (standard deviation 2). (b) $\alpha$ uncertainty (standard
deviation 0.03).}\label{F1c}
\end{figure}
We can also construct histograms of the parameters
<<eval=FALSE>>=
hist(bs$bootstraps[, 2])
hist(bs$bootstraps[, 3])
@
\noindent to obtain figure \ref{F1c}.

A similar bootstrap analysis can be obtained for the log-normal distribution
<<eval=FALSE>>=
bs1 = bootstrap(m_ln)
@
\noindent In this case, we would obtain uncertainty estimates for both of the
log-normal parameters.
<<echo=FALSE>>=
data(bootstrap_p_moby, package = "poweRlaw")
bs_p = bootstrap_p_moby
@
\begin{figure}[t]
\centering
<<echo=FALSE>>=
plot(bs_p)
@
\caption{Results from the bootstrap procedure (for the power law model) using
the Moby Dick data set: \mbox{\texttt{bootstrap\_p(m\_pl)}}. The top row shows
the mean estimate of parameters $\xmin$, $\alpha$ and the $p\,$-value. The
bottom row shows the estimate of standard deviation for each parameter.
The dashed lines give approximate 95\% confidence intervals.}\label{F1d}
\end{figure}

\subsection{Testing the power law hypothesis}

Since it is possible to fit a power law distribution to \textit{any} data set,
it is appropriate to test whether the observed data set actually follows a
power law. \citet{clauset2009} suggest that this hypothesis is tested using a
goodness-of-fit test, via a bootstrapping procedure. This test generates a
$p\,$-value that can be used to quantify the plausibility of the hypothesis.
If the $p\,$-value is large, then any difference between the empirical data
and the model can be explained by statistical fluctuations. If $p \simeq 0$,
then the model does not provide a plausible fit to the data and another
distribution may be more appropriate. In this scenario,
\begin{align*}
&H_0: \mbox{data is generated from a power law distribution.}\\
&H_1: \mbox{data is not generated from a power law distribution.}
\end{align*}
\noindent To test these hypotheses, we use the \texttt{bootstrap\_p} function
<<eval=FALSE>>=
bs_p = bootstrap_p(m_pl)
@
\noindent The point estimate of the $p\,$-value is one of the elements of the
\texttt{bs\_p} object\footnote{Also given is the average time of a single
bootstrap: \mbox{\texttt{bs\_p\$sim\_time} = \Sexpr{signif(bs_p$sim_time, 3)}}
seconds.}
<<>>=
bs_p$p
@
\noindent Alternatively we can plot the results
<<eval=FALSE>>=
plot(bs_p)
@
\noindent to obtain figure \ref{F1d}. The graph in the top right-hand corner
gives the cumulative estimate of the $p\,$-value; the final value of the
purple line corresponds to \texttt{bs\_p\$p}. Also given are approximate 95\%
confidence intervals.

\subsection{Comparing distributions}

A second approach to test the power law hypothesis is a direct comparison of
two models.\footnote{While the bootstrap method is useful, it is
computationally intensive and will be unsuitable for most models.} A standard
technique is to use Vuong's test, which is a likelihood ratio test for model
selection using the Kullback-Leibler criterion. The test statistic, $R$, is
the logarithm of the ratio of the likelihoods of the data under the two
competing models. The sign of $R$ indicates which model is \textit{better}.
Since the value of $R$ is obviously subject to error, we use the method
proposed by \citet{Vuong1989}.

To compare two distributions, each distribution must have the same lower
threshold. So we first set the log-normal distribution object to have the same
$\xmin$ as the power law object
<<>>=
m_ln$setXmin(m_pl$getXmin())
@
\noindent Next we estimate the parameters for this particular value of
$\xmin$:
<<>>=
est = estimate_pars(m_ln)
m_ln$setPars(est)
@
\noindent Then we can compare distributions
<<>>=
comp = compare_distributions(m_pl, m_ln)
@
\noindent This comparison gives a $p\,$-value of
\Sexpr{signif(comp[["p_two_sided"]], 4)}. This $p\,$-value corresponds to the
$p\,$-value on page 29 of the \citeauthor{clauset2009} paper (the paper gives
0.69). Overall these results suggest that one model cannot be favoured over
the other.

\subsection{Investigating the lower bound}

The estimate of the scaling parameter, $\alpha$, is typically highly
correlated with the threshold limit, $\xmin$. This relationship can be easily
investigated with the \cc{poweRlaw} package.
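The key ingredient is \cc{estimate\_pars}, which computes the maximum
likelihood estimate of $\alpha$ conditional on whatever value of $\xmin$ is
currently stored in the distribution object. A small illustrative sketch (the
threshold of 50 below is an arbitrary choice, purely for illustration):
<<eval=FALSE>>=
## Conditional estimate of the scaling parameter at a single threshold
m_pl$setXmin(50)          # fix the lower bound (arbitrary illustrative value)
estimate_pars(m_pl)$pars  # MLE of alpha conditional on this xmin
@
\noindent Repeating this over a grid of thresholds traces out the relationship
between $\xmin$ and $\hat \alpha$.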
First, we create a vector of thresholds to scan
<<>>=
xmins = seq(1, 1001, 5)
@
<<echo=FALSE>>=
est_scan = 0 * xmins
for (i in seq_along(xmins)) {
  m_pl$setXmin(xmins[i])
  est_scan[i] = estimate_pars(m_pl)$pars
}
@
\begin{figure}[t]
\centering
<<echo=FALSE>>=
plot(xmins, est_scan, type = "s",
     panel.first = grid(),
     xlab = expression(x[min]), ylab = expression(alpha),
     ylim = c(1.6, 2.8), col = 1)
abline(h = 1.95, col = 2, lty = 2)
@
\caption{Estimated parameter values conditional on the threshold, $\xmin$. The
horizontal line corresponds to $\alpha=1.95$.}\label{F1e}
\end{figure}
\noindent then a vector to store the results
<<eval=FALSE>>=
est_scan = 0 * xmins
@
\noindent Next, we loop over the $\xmin$ values and estimate the parameter
value conditional on the $\xmin$ value
<<eval=FALSE>>=
for (i in seq_along(xmins)) {
  m_pl$setXmin(xmins[i])
  est_scan[i] = estimate_pars(m_pl)$pars
}
@
\noindent The results are plotted in figure \ref{F1e}. For this data set, as
the lower threshold increases, so does the point estimate of $\alpha$.
\clearpage

\section{Discrete data: Swiss-Prot}

UniProtKB/Swiss-Prot is a manually annotated, non-redundant protein sequence
database. It combines information extracted from scientific literature and
biocurator-evaluated computational analysis. In a recent paper,
\citet{Bell2012} used the power law distribution to investigate the evolution
of the database over time. A single version of the data set is available with
the package and can be accessed via
<<>>=
data("swiss_prot", package = "poweRlaw")
head(swiss_prot, 3)
@
\noindent This data set contains all the words extracted from the Swiss-Prot
version 9 data (with the resulting frequency for each word). Data sets for
other database versions can be obtained by contacting Michael Bell
\begin{center}
\url{http://homepages.cs.ncl.ac.uk/m.j.bell1/annotationQualityPaper.php}
\end{center}
The first column gives the word/symbol, while the second column gives the
number of times that particular word occurs.

We can fit a discrete power law in the usual way
<<>>=
m_sp = displ$new(swiss_prot$Value)
est_sp = estimate_xmin(m_sp)
m_sp$setXmin(est_sp)
@
\begin{figure}[t]
\centering
<<echo=FALSE>>=
plot(m_sp, pch = 21, bg = 2,
     panel.first = grid(col = "grey80"),
     xlab = "Word Occurrence", ylab = "CDF")
lines(m_sp, col = 3, lwd = 3)
@
\caption{Log-log plot for the Swiss-Prot data set.}\label{sp1}
\end{figure}
\noindent which gives estimates of $\xmin = \Sexpr{est_sp$xmin}$ and
$\alpha = \Sexpr{signif(est_sp$pars, 3)}$.

We can spice up the base graphics plot by changing the plotting defaults.
First, we change the graphical parameters
<<>>=
par(mar = c(3, 3, 2, 1), mgp = c(2, 0.4, 0), tck = -.01,
    cex.axis = 0.9, las = 1)
@
\noindent Then we plot the data and model, adding a background grid and
changing the plotting symbol
<<eval=FALSE>>=
plot(m_sp, pch = 21, bg = 2,
     panel.first = grid(col = "grey80"),
     xlab = "Word Occurrence", ylab = "CDF")
lines(m_sp, col = 3, lwd = 3)
@
\noindent to get figure \ref{sp1}.
\clearpage

\section{Continuous data: electrical blackouts}

In this example, we will investigate the number of customers affected by
electrical blackouts in the United States between 1984 and 2002 (see
\citealp{newman2005} for further details). The data set can be downloaded from
Clauset's website
\begin{center}
\url{http://tuvalu.santafe.edu/~aaronc/powerlaws/data/blackouts.txt}
\end{center}
and loaded into R in the usual way
<<eval=FALSE>>=
blackouts = read.table("blackouts.txt")$V1
@
\noindent Although the \texttt{blackouts} data set is discrete, since the
values are large it makes sense to treat the data as continuous. Continuous
power law objects take vectors as inputs, so
<<>>=
m_bl = conpl$new(blackouts)
@
\noindent then we estimate the lower bound via
<<>>=
est = estimate_xmin(m_bl)
@
\noindent This gives a point estimate of $\xmin=\Sexpr{est$xmin}$.
We can then update the distribution object
<<>>=
m_bl$setXmin(est)
@
\noindent and plot the data with the line of best fit
<<eval=FALSE>>=
plot(m_bl)
lines(m_bl, col = 2, lwd = 2)
@
\noindent to get figure \ref{F2a}. To fit a continuous log-normal
distribution, we follow a similar procedure
<<>>=
m_bl_ln = conlnorm$new(blackouts)
est = estimate_xmin(m_bl_ln)
m_bl_ln$setXmin(est)
@
\begin{figure}[b]
\centering
<<echo=FALSE>>=
plot(m_bl, pch = 21, bg = 1,
     panel.first = grid(col = "grey80"),
     xlab = "Blackouts", ylab = "CDF")
lines(m_bl, col = 2, lwd = 3)
lines(m_bl_ln, col = 3, lwd = 3)
@
\caption{CDF plot of the blackout data set with lines of best fit. Since the
minimum value of $x$ is large, we fit a continuous power law as this is more
efficient. The power law fit is the green line; the log-normal fit is the red
line.}\label{F2a}
\end{figure}
\noindent and add the line of best fit to the plot via
<<eval=FALSE>>=
lines(m_bl_ln, col = 3, lwd = 2)
@
\noindent It is clear from figure \ref{F2a} that the log-normal distribution
provides a better fit to this data set.
\clearpage

\section{Multiple data sets: the American-Indian war}

In a recent paper, \citeauthor{Bohorquez2009} investigated insurgent attacks
in Afghanistan, Iraq, Colombia, and Peru. In each case, the data resembled a
power law. \citeauthor{Friedman2013} used the power law nature of casualties
to infer under-reporting in the American-Indian war. Briefly, by fitting a
power law distribution to the observed process, the latent, unobserved
casualties can be inferred \citep{Friedman2013}.

The number of casualties observed in the American-Indian war can be obtained
via
<<>>=
data("native_american", package = "poweRlaw")
data("us_american", package = "poweRlaw")
@
\noindent Each data set is a data frame with two columns. The first column
gives the number of casualties recorded, the second the date of the conflict
<<>>=
head(native_american, 3)
@
\noindent The records span around one hundred years, 1776 -- 1890. The data is
plotted in figure \ref{F3a}.
\begin{figure}[!b]
\centering
<<echo=FALSE>>=
plot(native_american$Date, native_american$Cas,
     log = "y", pch = 21, bg = 1,
     ylim = c(1, 2000), cex = 0.5,
     panel.first = grid(col = "grey70"),
     xlab = "Date", ylab = "#Casualties")
points(us_american$Date, us_american$Cas, pch = 24, bg = 2, cex = 0.5)
@
\caption{Casualty record for the American-Indian war, 1776 -- 1890. Native
American casualties (purple circles) and US American casualties (green
triangles). Data taken from \citet{Friedman2013}.}\label{F3a}
\end{figure}
It is straightforward to fit a discrete power law to these data sets. First,
we create discrete power law objects
<<>>=
m_na = displ$new(native_american$Cas)
m_us = displ$new(us_american$Cas)
@
\noindent then we estimate $\xmin$ for each data set
<<>>=
est_na = estimate_xmin(m_na, pars = seq(1.5, 2.5, 0.01))
est_us = estimate_xmin(m_us, pars = seq(1.5, 2.5, 0.01))
@
\noindent and update the power law objects
<<>>=
m_na$setXmin(est_na)
m_us$setXmin(est_us)
@
\noindent The resulting fitted distributions can be plotted on the same figure
<<eval=FALSE>>=
plot(m_na)
lines(m_na)
## Don't create a new plot, just store the output
d = plot(m_us, draw = FALSE)
points(d$x, d$y, col = 2)
lines(m_us, col = 2)
@
\begin{figure}[t]
\centering
<<echo=FALSE>>=
plot(m_na, bg = 1, pch = 21, cex = 0.5,
     panel.first = grid(col = "grey70"),
     xlab = "#Casualties", ylab = "CDF")
lines(m_na, lwd = 2, col = 1)
d = plot(m_us, draw = FALSE)
points(d$x, d$y, bg = 2, pch = 24, cex = 0.5)
lines(m_us, lwd = 2, col = 2)
@
\caption{Plots of the CDFs for the Native American and US American casualties.
The lines of best fit are also given.}\label{F3b}
\end{figure}
\noindent The result is given in figure \ref{F3b}. The tails of the
distributions appear to follow a power law. This is consistent with the
expectation that smaller-scale engagements are less likely to have been
recorded, whereas for larger-scale engagements it is likely that the event was
recorded.
<<echo=FALSE>>=
# R compiles all vignettes in the same session, which can be bad
rm(list = ls(all = TRUE))
@
\bibliography{poweRlaw}
\bibliographystyle{plainnat}
\end{document}