From 2985de49623ba9baae6f7d2f06f958b78a22213b Mon Sep 17 00:00:00 2001
From: Julie Nijs <107923146+nysju@users.noreply.github.com>
Date: Wed, 3 May 2023 14:55:49 +0200
Subject: [PATCH] Update hortative_alternation_analysis.R

---
 hortative_alternation_analysis.R | 102 +++++++------------------------
 1 file changed, 23 insertions(+), 79 deletions(-)

diff --git a/hortative_alternation_analysis.R b/hortative_alternation_analysis.R
index 7f0e1f3..c0c4acc 100644
--- a/hortative_alternation_analysis.R
+++ b/hortative_alternation_analysis.R
@@ -26,97 +26,41 @@ hist(log(Datasteden$BAIy1850),
      ylab='Aantal steden',
      main='Histogram van de steden per populatiegrootte (loggetransformeerd)')
 
-# read the data of the hortative alternation: 
-Verandering <- read_xlsx("The hortative alternation.xlsx", 
+Hortatief <- read_xlsx("The hortative alternation.xlsx", 
                          col_names = TRUE)
 
 # add population counts:
-Merged <- left_join(Verandering, Datasteden, by = c("BIRTHPLACE" = "CITY"))
+Hortatief <- left_join(Hortatief, Datasteden, by = c("BIRTHPLACE" = "CITY"))
 
 # throw away rows without birthplace:
-Merged <- Merged %>% drop_na(BAIy1850) 
-Merged$Change <- as.factor(Merged$Change) # recode variable Change to a factor
-Merged <- droplevels(Merged[!Merged$Change=="NA",]) # drop NA values
+Hortatief <- Hortatief %>% drop_na(BAIy1850) 
+Hortatief$Change <- as.factor(Hortatief$Change) 
+Hortatief <- droplevels(Hortatief[!Hortatief$Change=="NA",])
 
 # add log transformed population counts to the data set:
-symbox(Merged$BAIy1850)
-Merged$logBAIy1850=log(Merged$BAIy1850)
+symbox(Hortatief$BAIy1850)
+Hortatief$logBAIy1850=log(Hortatief$BAIy1850)
 
-#### glm model change vs year and population count ####
-Model <- glm(Change ~ year + logBAIy1850, 
-              data=Merged, family = binomial(link="logit"))
-print(summary(Model))
+#### Analysis per bin of cities: 3 bins ####
+
+Hortatief$Change2 <- as.numeric(Hortatief$Change)-1 # rescale to numeric for ggplot
 
-plot(allEffects(Model))
-  
-Merged$Change2 <- as.numeric(Merged$Change)-1 # rescale to numeric for ggplot
+# create bins:
+Hortatief <- Hortatief %>% mutate(logBAIy1850_bin_3 = cut_interval(Hortatief$logBAIy1850, 
+                                                             n = 3))
+Model_bins <- glm(Change ~ year*logBAIy1850_bin_3, 
+                  data=Hortatief, family = binomial(link="logit"))
+print(summary(Model_bins))
+plot(allEffects(Model_bins))
 
-# data visualisation: hortative alternation over time
-print(ggplot(data = Merged, aes(x = year, y = Change2)) +
+print(ggplot(data = Hortatief, aes(x = year, y = Change2, color=logBAIy1850_bin_3)) +
         geom_smooth(method = "glm", 
-                    method.args = list(family = binomial), 
-                    colour = "black")+
-        labs(x="Jaar", y="laat ons vs laten we", title="De hortatiefalternantie") +
+                    method.args = list(family = binomial))+
+        labs(x="Jaar", y="laat ons vs laten we") +
+        guides(col=guide_legend(title="Populatiegrootte")) +
+        scale_color_discrete(labels = c("Klein", "Middelgroot", "Groot")) +
         scale_x_continuous(limits = c(1850, 1990), 
                            breaks = seq(1850, 1990, by = 10)) +
         scale_y_continuous(limits = c(0,1), 
-                            n.breaks = 10)+
+                           n.breaks = 10)+
         theme_bw())
-
-#### Analysis per bin of cities: 3 bins ####
-
-# create 3 bins based on population counts:
-# not log-transformed:
-Merged <- Merged %>% mutate(BAIy1850_bin_3 = cut_interval(Merged$BAIy1850, 
-                                                          n = 3)) 
-# log-transformed:
-Merged <- Merged %>% mutate(logBAIy1850_bin_3 = cut_interval(Merged$logBAIy1850, 
-                                                             n = 3))
-
-# histograms of the amount of cities per population size in the hortative dataset:
-hist(Merged$BAIy1850) # not log-transformed: huge gap in the distribution
-hist(Merged$logBAIy1850) # log-transformed:  nicer distribution
-
-# calculate slopes per bin of cities:
-
-# function calculates slopes and standard errors and plots 
-my_lm <- function(df) {
-  m <-(glm(Change ~ year, data = df, family = binomial(logit))) # make logistic regression model
-  print(summary(m)) # check model
-  summary_m <- summary(m) # store the summary of the model
-  slopes <- summary_m$coefficients[2, 1] # get the slope
-  std_errors <- summary_m$coefficients[2, 2] # get the standard error
-  slopes_df <- expand.grid(slopes, std_errors) # store slope and st error in dataframe
-  df$Change2 <- as.numeric(df$Change)-1 # rescale to numeric for ggplot
-  # make plot:
-  print(ggplot(data = df, aes(x = year, y = Change2)) +
-          geom_smooth(method = "glm", 
-                      method.args = list(family = binomial), 
-                      colour = "black")+
-          labs(x="Jaar", y="", title="De hortatiefalternantie") +
-          scale_x_continuous(limits = c(1850, 1990), 
-                             breaks = seq(1850, 1990, by = 10)) +
-          scale_y_continuous(limits = c(0,1), 
-                             n.breaks = 10)+
-          theme_bw())
-  return(slopes_df)
-}
-
-# calculate slope and standard deviation and plot slope per bin of cities (log-transformed):
-slopes_bin3 <- by(Merged, Merged$logBAIy1850_bin_3, my_lm) 
-
-df_bin3 <- do.call(rbind, slopes_bin3) # make dataframe with slopes and standard deviations
-
-# plotting:
-
-# calculate 1.95 confidence interval based on standard error:
-ymin <- df_bin3$Var1-(1.95*df_bin3$Var2)
-ymax <- (1.95*df_bin3$Var2)+df_bin3$Var1
-
-print(ggplot(df_bin3, aes(c("[0.693,2.27]", "[2.27,3.84]", "[3.84,5.42]"), Var1, 
-            ymin = Var1-(1.95*Var2), ymax = (1.95*Var2)+Var1))+
-  geom_point()+
-  geom_errorbar()+
-  labs(x="Groepering per populatiegrootte van de steden", 
-       y="Richtingscoëfficiënten", 
-       title="De hortatiefalternantie"))