analysis.Rnw

%See MacArthur_Paper_Analysis.R for instructions on how to configure the
%Analysis supporting this script.
%tools>global options>sweave>weave rnw files using: knitr

%sudo apt-get install texlive
%sudo apt-get install texlive-latex-extra 
%sudo apt-get install texlive-bibtex-extra
%sudo apt-get install texlive-fonts-recommended

\documentclass{article}
\usepackage{float}
\usepackage{wrapfig}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage[backend=bibtex, style=nature, citestyle=authoryear]{biblatex}
\usepackage[table]{xcolor}
\bibliography{WBVFM_IntroPar}
\newenvironment{knitrout}{}{}  %just a dummy environment
\makeatletter
\newcommand\gobblepars{%
    \@ifnextchar\par%
        {\expandafter\gobblepars\@gobble}%
        {}}
\makeatother
\title{Descriptives: SEA and MacArthur}
\begin{document}
%\SweaveOpts{concordance=TRUE}
\begin{knitrout}
<<setup, echo=FALSE, include=FALSE>>=
Sys.setenv(TEXTINPUTS=getwd(),
           BIBINPUTS=getwd(),
           BSTINPUTS=getwd())
render_sweave()
#Make sure this is set correctly for builds.
rebuild_images = TRUE
source("MacArthur_Paper_Analysis.R")
@

\maketitle
<<Pre_warning, echo=FALSE>>=
if(restrict_analysis != FALSE)
  {
  wartxt = paste("Currently, only ",restrict_analysis," cells are being used in this analysis, due to computational limitations.", sep="")
  }else
  {
  wartxt = "Draft Text."
  }
@
\textbf{WARNING:} \Sexpr{wartxt}
\tableofcontents

\newpage

\section{Data}
<<Variable_table, results='asis', echo=FALSE, cache=FALSE>>=
varMatrix <- matrix(data=NA, nrow=length(names(vars)), ncol=2)
for (i in 1:length(names(vars)))
{
  cur_var <- names(vars)[i]
  
  short_name <-eval(parse(text= paste("vars$",cur_var,"$data_mini", sep="")))
  varMatrix[i,] <- c(short_name, cur_var)
}

var_caption <- paste0("Variables used in this analysis.")

colnames(varMatrix) <- c("Short Name", "Long Name")

print(xtable(varMatrix, caption=var_caption, label="Variable_Table"), 
      size="footnotesize",
      include.rownames=FALSE,
      include.colnames=TRUE,
      capiton.placement="bottom"
      )
@
Variable names are defined in Table \ref{Variable_Table}.  

\newpage

\section{Descriptives}

%Cell-level statistics
<<Cell_Stats, results='asis', echo=FALSE>>=
#selecting just variables of interest
dta3 <- dta2[23:297]
#Select only the mean temperature and precip, for now.
colnames(dta3)[2] <- "per_loss_e"
dta3 <- dta3[,grep('e$',names(dta3), invert=FALSE)]

#Process dataframe to summarize temporal variables into single summaries
collapse_temporal <- function(x, prefix)
{
  eval(parse(text=paste("t_dta <- x[,grep('^",prefix,"',names(x), invert=TRUE)]",sep="")))
  eval(parse(text=paste("stat_dta <- x[,grep('^",prefix,"',names(x), invert=FALSE)]",sep="")))
  eval(parse(text=paste("t_dta$",prefix,"_mean <- rowMeans(stat_dta)", sep="")))
  return(t_dta)
}
stargaze_dta <- collapse_temporal(dta3, "lnyx")
stargaze_dta <- collapse_temporal(stargaze_dta, "ncc4")
stargaze_dta <- collapse_temporal(stargaze_dta, "at41")
stargaze_dta <- collapse_temporal(stargaze_dta, "pc41")
ttl = paste("Cell level descriptive statistics (N=",length(dta2[[1]]),")",sep="")
stargazer(stargaze_dta, title=ttl, notes="Variables with 'mean' appendix are summaries of the yearly mean values.")
@

%ADM2-level statistics
<<ADM_Stats, results='asis', echo=FALSE>>=
#selecting just variables of interest
dta3 <- dta2[c(6,23:297)]
colnames(dta3)[1] <- "OBJECTIDe"
colnames(dta3)[3] <- "per_loss_e"
dta3 <- dta3[,grep('e$',names(dta3), invert=FALSE)]

dta3_adm <- aggregate(dta3, by=list(dta3$OBJECTID), FUN=mean, na.rm=TRUE)

#Process dataframe to summarize temporal variables into single summaries
collapse_temporal <- function(x, prefix)
{
  eval(parse(text=paste("t_dta <- x[,grep('^",prefix,"',names(x), invert=TRUE)]",sep="")))
  eval(parse(text=paste("stat_dta <- x[,grep('^",prefix,"',names(x), invert=FALSE)]",sep="")))
  eval(parse(text=paste("t_dta$",prefix,"_mean <- rowMeans(stat_dta)", sep="")))
  return(t_dta)
}
stargaze_dta <- collapse_temporal(dta3_adm, "lnyx")
stargaze_dta <- collapse_temporal(stargaze_dta, "ncc4")
stargaze_dta <- collapse_temporal(stargaze_dta, "at41")
stargaze_dta <- collapse_temporal(stargaze_dta, "pc41")
stargaze_dta2 <- stargaze_dta[-c(1,2)]
ttl = paste("ADM2 level descriptive statistics (N=",length(unique(dta2$OBJECTID)),")",sep="")
stargazer(stargaze_dta2, title=ttl, notes=c("Variables with 'mean' appendix are summaries of the yearly mean values.", "All variables represent the mean cell value within a ADM 2."))
@
\begin{figure}[H]
\caption{Study area (ADM2) overlayed with Chinese development finance project locations.}
\label{LTDR_Hansen}
\centering
<<StudyAreaMap, echo=FALSE, fig="TRUE", out.width="0.8\\linewidth">>=
plot(spdf_adm)
points(Mac_spdf, col="red")
@
\end{figure}

\begin{figure}[H]
\caption{Relationship between LTDR 2000 NDVI and Hansen \% Forest Cover 2000, of LTDR cells with at least \Sexpr{forest_thresh}\% forest cover according to Hansen 2000.}
\label{LTDR_Hansen}
\centering
<<LTDR_NDVI, echo=FALSE, fig="TRUE", out.width="0.8\\linewidth">>=
plot(ndviDTA_for)
@
\end{figure}
The distribution of NDVI values for the remaining LTDR cells can be seen in figure \ref{LTDR_Hansen}.  
As expected, we see a strong correlation - areas with higher levels of NDVI also tend to have higher levels of forest cover as estimated by Hansen.  
Because LTDR and Hansen are not independent products (Hansen leverages the same input as LTDR in the product production), this analysis does not suggest that one product can be used to independently verify the other.  
Rather, here we illustrate that historic LTDR trends can be relevant for establishing baselines when Hansen outcome variables are considered.
\begin{figure}[H]
\caption{Histogram of LTDR NDVI Values in (a) locations with >\Sexpr{forest_thresh}\% forest cover in Hansen 2000, and (b) locations with <\Sexpr{forest_thresh}\% forest cover in Hansen 2000.}
\label{NDVI_Hansen_hist}
\centering
<<LTDR_NDVI_hist, echo=FALSE, fig="TRUE", out.width="0.8\\linewidth">>=
par(mfrow=c(2,1))
ttl = paste("Histogram of NDVI values in LTDR 5km w/ >",forest_thresh,"% Forest Cover", sep="")
hist(ndviDTA_for$lnyx_2000e, main=ttl, col="green", xlim=c(0,10000))
ttl2 = paste("Histogram of NDVI values in LTDR 5km w/ <",forest_thresh,"% Forest Cover", sep="")
hist(ndviDTA_notfor$lnyx_2000e, main=ttl2, col="red", xlim=c(0,10000))
@
\end{figure}
\section{Methods}

From each cell defined as forest (cells with > \Sexpr{forest_thresh}\% forest cover according to Hansen 2000), the euclidean distance (Haversine) to each chinese investment site (N = \Sexpr{length(locations2[[1]])}) is calculated and recorded.  
On average, the minimum distance between a unit of observation (the cell) and each chinese investment site is \Sexpr{round(minDistKm,0)}km, with an average distance of \Sexpr{round(avgDistKm,0)}km.
\par
To establish the degree of impact Chinese investment had on each grid cell, a weighted distance-decay function is approximated.  
The spatial autocorrelation of forest tree cover is approximated using the 1999 LTDR dataset to avoid any potential confounds with treatments that start circa 2000.
By examining the spatial autocorrelation of forest tree cover, we hope to establish the maximum distance at which spillover effects might feasibly be observed (as the spatial pattern of tree cover in 1999 is the produce of all preceeding impacts).
Spatial autocorrelation is examined over 50 kilometer steps - for example, all cells within 50km are contrasted to one another, and the correlation of forest cover is recorded.  
This is then repeated in bands, i.e., 50-100km; 100-150km and so forth until the 1000km limit is reached. 
At each distance band, a summary measure of spatial autocorrelation - Morans I - is calculated, following:
\begin{equation}
I_h = (\frac{N}{\sum_{i}^{N}\sum_{j}^{N}w_{ij}}) * ((\sum_{i}^{N}\sum_{j}^{N}w_{ij} * (X_{i}-\bar{x}) * \frac{X_{j} - \bar{x}}{\sum_{i}^{N}(X_{i}-\bar{x})})^{2})
\end{equation}
where \textit{h} represents each spatial bin, \textit{N} the number of spatial units, \textit{i} and \textit{j} are indexes for each unit, \textit{X} is the variable of itnerest, and \begin{math}W_{ij}\end{math} represents the weights matrix.  
In this application, the weights matrix is specified according to the bin (\textit{h}) being analyzed.  
For example, if a chinese project is 100 kilometers from a cell of interest, the relevant spatial correlation estimated in this function is used to weight that project.
\\


  \begin{figure}[H]
  \caption{Distance decay of spatial autocorrelation observed in 1999 Forest Cover, as measured using the LTDR dataset.}
\label{LTDR_1999_Decay}
\centering
<<LTDR_1999_Decay, echo=FALSE, fig="TRUE", out.width="0.8\\linewidth">>=
plot(correlogram_data)
@
\end{figure}

As this correlogram illustrates, a lack of positive spatial autocorrelation among LTDR measurements in 1999 is first observed at approximately \Sexpr{round(as.numeric(correlogram_data$x.intercept))} km.  
We use this distance as a threshold to screen nearby Chinese aid projects - i.e., the average and sum distance of all Chinese aid projects within \Sexpr{round(as.numeric(correlogram_data$x.intercept))} km is taken for each cell, and projects beyond that distance are not included.  
These two values - the average and summed distance of Chinese aid projects - are used as our approximation of the strength of Chinese interventions within any given cell on the landscape.
\par


  \begin{figure}[H]
  \caption{Potential measurements of treatment indicator.}
\label{Proj_Avg_Dist}
\centering
<<Treatment_Map, echo=FALSE, cache=FALSE>>=
mapAvg = spplot(AOI_cells, "thresh_avgDist", col="transparent", main=list(label=paste("Average Distance \n (within ",round(correlogram_data$x.intercept)," km) \n to Chinese projects", sep=""),cex=0.75), sub="Threshold Approach", sp.layout=list(Mac_spdf, pch=16, col="red", cex=0.5))
mapCnt = spplot(AOI_cells, "thresh_tot_proj", col="transparent", main=list(label=paste("Total Nearby \n (within ",round(correlogram_data$x.intercept)," km) \n Chinese projects", sep=""),cex=0.75), sub="Threshold Approach", sp.layout=list(Mac_spdf, pch=16, col="red", cex=0.5))
mapTotDist =  spplot(AOI_cells, "thresh_totDist", col="transparent", main=list(label=paste("Total Distance of Nearby \n (within ",round(correlogram_data$x.intercept)," km) \n Chinese projects", sep=""),cex=0.75), sub="Threshold Approach", sp.layout=list(Mac_spdf, pch=16, col="red", cex=0.5))
mapWeightedDist =  spplot(AOI_cells, "thresh_weightedDist", col="transparent", main=list(label="Weighted Distance \n of \n Chinese projects",cex=0.75), sub="Distance Decay Approach", sp.layout=list(Mac_spdf, pch=16, col="red", cex=0.5))

if(rebuild_images == TRUE)
{
png(filename="mapFigs.png", width=7, height=5, units="in", res=300)
grid.arrange(mapAvg, mapCnt, mapTotDist,mapWeightedDist, nrow=2, ncol=2, heights=c(unit(2.5, "inches"),unit(2.5, "inches")),widths=c(unit(3.5, "inches"),unit(3.5, "inches")))
}
@
\includegraphics{mapFigs}
\end{figure}

  \begin{figure}[H]
  \caption{Potential measurements of treatment indicator.}
\label{Temporal_Measurements}
\centering
<<OverTime_Treatment_Fig, echo=FALSE, fig="TRUE", out.width="0.8\\linewidth">>=
par(mfrow=c(2,2))
plot(TempDF$all_years, TempDF$Dist_Decay_Yrs, type="b", lty=2, xlab="Year", ylab="Avg. Dist. Weighted Est. Impact of Chinese Aid")
plot(TempDF$all_years, TempDF$AvgYears, type="b", lty=2, xlab="Year", ylab="Avg. Dist. to New Chinese Aid Proj.")
plot(TempDF$all_years, TempDF$CountProj_Years, type="b", lty=2, xlab="Year", ylab="Count of New Chinese Aid Proj.")
plot(TempDF$all_years, TempDF$Avg_MinYears, type="b", lty=2, xlab="Year", ylab="Avg. Dist. to Closest New Chinese Aid Proj.")

@
\end{figure}
Figure \ref{Proj_Avg_Dist} shows an example of the treatment variables examined in this analysis, averaged across the full time period, while \ref{Temporal_Measurements} shows the overall fluctuations of these measurements over time, across all grid cells being examined.


Functional Form of Analysis:
\begin{equation}
Hansen_{it} = \alpha + \theta * WeightedDistanceActiveChineseProject_{it} + \sum(\beta_{j}*X_{j}) + D_{region} + D_{t} + D_{region} * t + \epsilon_{it}
%\frac{1}{(2\pi)^{1/2}*\sigma_{i}} * exp(-\frac{1}{2}*\frac{(x-\mu_{i})^{2}}{\sigma_{i}^{2}})
\end{equation}

<<Model_Results, results='asis', echo=FALSE>>=
stargazer(CMREG, title="Initial Results", keep=c("DecayYr","Forest_Loss", "NighttimeLights","MinTemp","MaxTemp","MeanTemp", "MinPrecip", "MaxPrecip", "MeanPrecip"))
@

\end{knitrout}
\end{document}