Loading ggplot2 and dplyr

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Loading data

# Load the `movies` dataset into the workspace.
# NOTE(review): `movies` appears to have been moved out of ggplot2 into the
# separate ggplot2movies package in later releases — confirm the installed
# ggplot2 still provides it before re-running this script.
data(movies)

Storing values in df

# Bind the dataset to the shorter name `df` used by the following steps.
df <- movies

Checking the dataset

# Print the first rows to inspect the columns; note that `budget` is NA in
# every row shown here.
head(df)
##                      title year length budget rating votes   r1   r2  r3
## 1                        $ 1971    121     NA    6.4   348  4.5  4.5 4.5
## 2        $1000 a Touchdown 1939     71     NA    6.0    20  0.0 14.5 4.5
## 3   $21 a Day Once a Month 1941      7     NA    8.2     5  0.0  0.0 0.0
## 4                  $40,000 1996     70     NA    8.2     6 14.5  0.0 0.0
## 5 $50,000 Climax Show, The 1975     71     NA    3.4    17 24.5  4.5 0.0
## 6                    $pent 2000     91     NA    4.3    45  4.5  4.5 4.5
##     r4   r5   r6   r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  4.5 14.5 24.5 24.5 14.5  4.5  4.5           0         0      1     1
## 2 24.5 14.5 14.5 14.5  4.5  4.5 14.5           0         0      1     0
## 3  0.0  0.0 24.5  0.0 44.5 24.5 24.5           0         1      0     0
## 4  0.0  0.0  0.0  0.0  0.0 34.5 45.5           0         0      1     0
## 5 14.5 14.5  4.5  0.0  0.0  0.0 24.5           0         0      0     0
## 6 14.5 14.5 14.5  4.5  4.5 14.5 14.5           0         0      0     1
##   Documentary Romance Short
## 1           0       0     0
## 2           0       0     0
## 3           0       0     1
## 4           0       0     0
## 5           0       0     0
## 6           0       0     0

Removing missing values and keeping only movies with at least one genre flag

# Drop every row that contains an NA, then keep only the movies flagged with
# at least one of the seven genre indicator columns.
# NOTE(review): na.omit() discards rows with an NA in *any* column (e.g.
# `budget`), not only in `length`/`year` used by the model below — confirm
# that this is intended.
df2 <- df %>%
  na.omit() %>%
  filter(
    Action == 1 | Animation == 1 | Comedy == 1 | Drama == 1 |
      Documentary == 1 | Romance == 1 | Short == 1
  )

Checking the filtered dataset

# Print the first rows of the filtered data; `budget` is populated in every
# row shown.
head(df2)
##                        title year length   budget rating votes   r1   r2
## 1                    'G' Men 1935     85   450000    7.2   281  0.0  4.5
## 2         'Til There Was You 1997    113 23000000    4.8   799  4.5  4.5
## 3 10 Things I Hate About You 1999     97 16000000    6.7 19095  4.5  4.5
## 4              100 Mile Rule 2002     98  1100000    5.6   181  4.5  4.5
## 5                  100 Proof 1997     94   140000    3.3    19 14.5 14.5
## 6                        101 1989    117   200000    7.8   299  4.5  0.0
##    r3   r4   r5   r6   r7   r8   r9  r10  mpaa Action Animation Comedy
## 1 4.5  4.5  4.5 14.5 34.5 34.5  4.5  4.5            0         0      0
## 2 4.5 14.5 14.5 14.5 14.5  4.5  4.5 14.5 PG-13      0         0      1
## 3 4.5  4.5  4.5 14.5 24.5 14.5 14.5 14.5 PG-13      0         0      1
## 4 4.5  4.5 14.5 24.5 14.5 14.5  4.5 14.5     R      0         0      1
## 5 4.5 14.5 14.5 14.5 14.5  0.0  0.0 24.5            0         0      0
## 6 4.5  4.5  4.5  4.5  4.5 14.5 14.5 45.5            0         0      0
##   Drama Documentary Romance Short
## 1     1           0       0     0
## 2     0           0       1     0
## 3     0           0       1     0
## 4     0           0       0     0
## 5     1           0       0     0
## 6     0           1       0     0

Linear Modelling

We’re looking at the effect of “year” on “length”

# Fit a simple linear regression with `length` as the response and `year` as
# the single predictor, using the filtered data.
fm1 <- lm(formula = length ~ year, data = df2)

Results of the model

# Print the residual quantiles, coefficient table, R-squared and F-statistic
# for the fitted model.
summary(fm1)
## 
## Call:
## lm(formula = length ~ year, data = df2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.601   -9.955    2.507   17.143  290.318 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 457.91527   49.32279   9.284  < 2e-16 ***
## year         -0.18203    0.02483  -7.332 2.69e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.49 on 4341 degrees of freedom
## Multiple R-squared:  0.01223,    Adjusted R-squared:  0.01201 
## F-statistic: 53.76 on 1 and 4341 DF,  p-value: 2.689e-13

Extracting coefficients

# Return the named vector of fitted coefficients (intercept and `year` slope).
coef(fm1)
## (Intercept)        year 
## 457.9152745  -0.1820292

Plotting basic diagnostics

# Draw the default diagnostic plots for the fitted linear model.
plot(fm1)