eda.Rmd

```{r}
# Project directory: used both as knitr's root.dir option and as the
# working directory of the current session.
path <- "E:/StockThing"
knitr::opts_knit$set(root.dir = path)
setwd(path)
```

```{r}
library(dplyr)
library(ggplot2)
```

```{r}
# Load the main table; later chunks read its name, date and close columns.
data <- read.csv(file = "complete.csv")
```

```{r}
# Preview the first and the last six rows of the raw table.
head(data, n = 6L)
tail(data, n = 6L)
```
```{r}
# Parse the date column (read as values such as 20190131) into Date objects.
data[["date"]] <- as.Date(
  as.character(data[["date"]]),
  format = "%Y%m%d"
)
```

```{r}
# Load the split records, drop rows containing NA, then parse the
# day-month-year date strings into Date objects.
splits <- na.omit(read.csv("splits.csv"))
splits[["date"]] <- as.Date(
  as.character(splits[["date"]]),
  format = "%d-%m-%Y"
)
```

```{r}
# Distinct company names, in order of first appearance in the data.
companies <- unique(data[["name"]])
```

```{r}
# Time series of the closing price for a single company (ABB).
abb <- filter(data, name == "ABB")
ggplot(abb, aes(x = date, y = close)) +
  geom_line() +
  ggtitle("ABB Stock Price Variation")
```



```{r}
# Find the companies that are still being traded, i.e. whose most recent
# record falls in `current_year`.
current_year <- 2019

# Year of every row, computed once instead of re-filtering `data` for each
# company. The year of a company's latest date equals the largest year among
# its rows, so a grouped max over the years is sufficient.
row_year <- as.numeric(format(data$date, "%Y"))
latest_year <- tapply(row_year, as.character(data$name), max)

# Look the result up by name so that `comp` keeps the order of `companies`.
# which() drops companies whose latest year is NA (e.g. a date that failed to
# parse) instead of aborting on `if (NA == ...)`.
is_current <- latest_year[as.character(companies)] == current_year
comp <- as.character(companies)[which(is_current)]

```

```{r}
# Keep only the currently traded companies that have at least one record in
# every calendar year between their first and their last record; companies
# with a gap year are dropped.

# Years present for each company, collected in a single pass over `data`.
row_year <- as.numeric(format(data$date, "%Y"))
years_by_company <- split(row_year, as.character(data$name))

has_every_year <- vapply(comp, function(company) {
  present <- years_by_company[[company]]
  present <- present[!is.na(present)]
  # No usable dates at all: the company cannot be validated, so drop it.
  if (length(present) == 0) {
    return(FALSE)
  }
  all(seq(min(present), max(present)) %in% present)
}, logical(1), USE.NAMES = FALSE)

comp_with_all_data <- comp[has_every_year]

#data <- data %>% filter(name %in% comp)
```


```{r}
# From here on work only with the companies that passed the completeness
# check, and keep only the split records that belong to them.
comp <- comp_with_all_data
splits <- filter(splits, code %in% comp)
```


```{r}
# For every company in `comp` that has more than 100 rows, compute:
#   c1 - company code
#   c2 - CAGR in percent between its first and last close, split-adjusted
#   c3 - number of calendar years with a positive split-adjusted return
#   c4 - number of calendar years with a zero or negative return
# The four vectors are parallel: position i describes the same company.

# Combined price adjustment for the splits recorded in the given years.
# Multiplies old_fv / new_fv over every matching split, so several splits
# (even within the same year) are all applied; returns 1 when none match.
split_adjustment <- function(split_rows, split_years, in_years) {
  hit <- split_years %in% in_years
  prod(split_rows$old_fv[hit] / split_rows$new_fv[hit])
}

# Close price of the first row of `rows` whose date equals `on_date`.
close_on <- function(rows, on_date) {
  rows$close[which(rows$date == on_date)[1]]
}

# Preallocate for the maximum possible number of companies; trimmed below.
n_comp <- length(comp)
c1 <- character(n_comp)
c2 <- numeric(n_comp)
c3 <- numeric(n_comp)
c4 <- numeric(n_comp)
kept <- 0

for (company in comp) {
  d <- data %>% filter(name == company)
  # Companies with 100 rows or fewer are left out of the statistics.
  if (nrow(d) <= 100) {
    next
  }

  latest <- max(d$date)
  earliest <- min(d$date)
  # Ask for days explicitly: the default units = "auto" chooses the unit
  # from the size of the difference.
  no_of_years <- as.numeric(difftime(latest, earliest, units = "days")) / 365

  years <- seq(
    as.numeric(format(earliest, "%Y")),
    as.numeric(format(latest, "%Y"))
  )
  row_years <- as.numeric(format(d$date, "%Y"))

  # Split years as a plain numeric vector. Matching against a one-column
  # data frame (what format(df["date"], ...) returns) compares with the
  # column as a whole, which only works when there is exactly one split row.
  spl <- splits %>% filter(code == company)
  split_years <- as.numeric(format(spl$date, "%Y"))

  # CAGR: scale the latest close by every split in the covered years so it
  # is comparable with the earliest close.
  ratio <- split_adjustment(spl, split_years, years)
  growth <- close_on(d, latest) * ratio / close_on(d, earliest)
  cagr <- ((growth^(1 / no_of_years)) - 1) * 100

  # Number of years of positive returns and of negative returns.
  pos_returns <- 0
  neg_returns <- 0
  for (val in years) {
    sub <- d[which(row_years == val), ]
    # A year without rows cannot be classified. The earlier completeness
    # check should make this unreachable; skip instead of failing.
    if (nrow(sub) == 0) {
      next
    }

    first_close <- close_on(sub, min(sub$date))
    last_close <- close_on(sub, max(sub$date))
    year_ratio <- split_adjustment(spl, split_years, val)

    if ((last_close * year_ratio) - first_close > 0) {
      pos_returns <- pos_returns + 1
    } else {
      neg_returns <- neg_returns + 1
    }
  }

  kept <- kept + 1
  c1[kept] <- company
  c2[kept] <- cagr
  c3[kept] <- pos_returns
  c4[kept] <- neg_returns
}

# Drop the unused tail left by companies that were skipped.
c1 <- c1[seq_len(kept)]
c2 <- c2[seq_len(kept)]
c3 <- c3[seq_len(kept)]
c4 <- c4[seq_len(kept)]
```


```{r}
# Assemble the per-company summary table from the parallel vectors c1..c4.
# One row per company with the columns:
#   c1       - company code
#   cagr     - compound annual growth rate (c2)
#   posyears - number of years with positive returns (c3)
#   negyears - number of years with negative returns (c4)
#   prop     - posyears / (posyears + negyears), the share of positive years
stats <- data.frame(c1, stringsAsFactors = FALSE)
stats$cagr <- c2
stats$posyears <- c3
stats$negyears <- c4
# transform() evaluates the expression with the columns of stats in scope.
stats <- transform(stats, prop = posyears/(posyears + negyears))
```

```{r}
# Top 50 companies by CAGR.
# unlist() flattens every column of stats into a plain vector before the
# table is rebuilt, so the columns can be ordered and plotted.
st <- as.data.frame(lapply(stats, unlist))

x <- st[order(st$cagr, decreasing = TRUE), ] %>%
  head(n = 50)
x

ggplot(data = x, aes(x = c1, y = cagr)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Company") +
  ggtitle("CAGR vs Company")
```

```{r}
# Top 50 companies by proportion of positive-return years.
y <- st[order(st$prop, decreasing = TRUE), ] %>%
  head(n = 50)
y

ggplot(data = y, aes(x = c1, y = prop)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Company") +
  ggtitle("Company vs Proportion of Positive Years")

```

```{r}
# Companies that appear in both top-50 lists (highest CAGR and highest
# proportion of positive years).
common <- intersect(x[["c1"]], y[["c1"]])
common
```