-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcomplexity_analysis_CLMET.R
202 lines (152 loc) · 5.5 KB
/
complexity_analysis_CLMET.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
setwd("./Datasets")
library(readxl)
library(ggplot2)
library(matrixStats)
library(effects)
library(lmtest)
library(reshape2)
library(dplyr)
library(CADFtest)
#### morphology ####
# read data
morph_data <- read_xlsx("CLMET_Morph_Zipped.xlsx", col_names = TRUE)
zipped <- read_xlsx("CLMET_Zipped_Sizes.xlsx", col_names = TRUE)
# merge data
morph_total <- merge(zipped, morph_data, by="...1")
# make all numbers negative
morph_total[,6:105] <- morph_total[,6:105]*(-1)
# divide by full zipped size to get complexity ratio
morph_total[,6:105] <- morph_total[,6:105]/morph_total[,4]
# get means for each row = get mean complexity ratio
morph_means <- rowMeans(morph_total[,6:105])
# add mean complexity ratios to data frame
morph_total$morph_means <- morph_means
# get standard deviations
morph_std = rowSds(as.matrix(morph_total[,6:105]))
# linear model
morph_model <- lm(morph_means ~ year, data=morph_total)
summary(morph_model)
plot(allEffects(morph_model))
# make plots
mp <- ggplot(morph_total,
aes(x = year, y = morph_means))+
ggtitle("Morphological complexity ratio over time") +
xlab("Year")+
ylab("Mean morphological complexity ratio") +
geom_point()
mp
ml <- ggplot(morph_total,
aes(x = year, y = morph_means))+
ggtitle("Morphological complexity ratio over time") +
xlab("Year")+
ylab("Mean morphological complexity ratio")+
geom_point()+
geom_smooth()
ml
#### Syntax ####
# read data
synt_data <- read_xlsx("CLMET_Synt_Zipped.xlsx", col_names = TRUE)
# merge data
synt_total <- merge(zipped, synt_data, by="...1")
# divide by full zipped size to get complexity ratio
synt_total[,6:105] <- synt_total[,6:105]/synt_total[,4]
# get means for each row = get mean complexity ratio
synt_means <- rowMeans(synt_total[,6:105])
# add mean complexity ratios to data frame
synt_total$synt_means <- synt_means
# get standard deviations
synt_std = rowSds(as.matrix(synt_total[,6:105]))
# linear model
synt_model <- lm(synt_means ~ year, data=synt_total)
summary(synt_model)
plot(allEffects(synt_model))
# make plots
sp <- ggplot(synt_total,
aes(x = year, y = synt_means))+
ggtitle("Syntactic complexity ratio over time") +
xlab("Year")+
ylab("Mean syntactic complexity ratio")+
geom_point()
sp
sl <- ggplot(synt_total,
aes(x = year, y = synt_means))+
ggtitle("Syntactic complexity ratio over time") +
xlab("Year")+
ylab("Mean syntactic complexity ratio")+
geom_point()+
geom_smooth()
sl
#### Morphology vs syntax ####
morph_and_synt <- data.frame("Morphology" = morph_total$morph_means,
"filename" = synt_total$filename,
"Syntax" = synt_total$synt_means,
"year" = synt_total$year)
morph_and_synt_model <- lm(morph_means ~ synt_means, data=morph_and_synt)
summary(morph_and_synt_model)
plot(allEffects(morph_and_synt_model))
x <- ggplot(morph_and_synt, aes(x = synt_means, y = morph_means)) +
xlab("Mean word order rigidity ratio") +
ylab("Mean morphological complexity ratio") +
geom_point() +
scale_y_continuous(labels = function(y) gsub("-", "−", as.character(y))) # Fix minus sign for y-axis
x
#### Time series analysis ####
# make time series for morphology means
morph_ts <- ts(morph_and_synt$Morphology)
CADFtest(morph_ts) # significant
plot(morph_ts)
morph_diff_ts <- diff(morph_ts) # detrending
# make time series for syntax means
synt_ts <- ts(morph_and_synt$Syntax)
CADFtest(synt_ts) # not significant: no unit root
plot(synt_ts)
synt_diff_ts <- diff(synt_ts) # detrending
# granger causality test
for (x in 1:5) {
print(grangertest(morph_ts ~ synt_ts, order = x))
}
for (x in 1:5) {
print(grangertest(synt_ts ~ morph_ts, order = x))
}
for (x in 1:5) {
print(grangertest(morph_diff_ts ~ synt_diff_ts, order = x))
}
for (x in 1:5) {
print(grangertest(synt_diff_ts ~ morph_diff_ts, order = x))
}
# Visualization:
library(latticeExtra)
tsDF <- data.frame(Decade = seq(from = 1710, to = 1920, by = 10),
"syntax" = synt_ts,
"morphology" = morph_ts)
m <- xyplot(morphology ~ Decade, tsDF, type = "l" , lwd=2)
s <- xyplot(syntax ~ Decade, tsDF, type = "l", lwd=2)
doubleYScale(m, s, add.ylab2 = TRUE, use.style=TRUE)
# Z-scored plot
# Z-score the time series
z_scored_morph <- scale(morph_ts)
z_scored_synt <- scale(synt_ts)
# Define y-axis range for consistency
y_range <- c(-3, 3) # Assuming this range based on your plot limits
# Adjust margins to allow space below the plot for the legend
par(mar = c(8, 4, 2, 2)) # Increase the bottom margin to 8
# Create a plot with the first z-scored time series
plot(z_scored_morph, type = "l", lty = 1, ylim = y_range,
ylab = "Z-Scored Values",
xlab = "Time",
xaxt = "n",
yaxt = "n")
# Add the second z-scored time series to the plot with a different line type
lines(z_scored_synt, lty = 2)
# Add years to the plot
years <- seq(1710, 1890, by = 10)
# Calculate the positions of the ticks
tick_positions <- seq(1, length(z_scored_morph), length.out = length(years))
axis(1, at = tick_positions, labels = years)
# Custom y-axis labels with proper minus sign
y_ticks <- seq(floor(min(y_range)), ceiling(max(y_range)), by = 1) # Define y-tick positions
y_labels <- gsub("-", "−", as.character(y_ticks)) # Replace hyphen with minus sign
axis(2, at = y_ticks, labels = y_labels) # Add y-axis with custom labels
# Add a legend
legend("bottom", legend = c("Morphological complexity", "Word order rigidity"),
lty = c(1, 2), cex = 0.8, inset = c(0, -0.35), xpd = TRUE, horiz = TRUE)