# +--------------------------------------------------------------------------+
# | Einfuehrung in die Oekonometrie                                          |
# | Lineare Mehrfachregression                                               |
# | Beispiel "Eigenheimpreise" (doc_eigenheimpreise.pdf)                     |
# | R. Schuhr 22.03.2012                                                     |
# + -------------------------------------------------------------------------+


# Read the data from "eigenheimpreise.csv" into the data frame "ehp"
# -------------------------------------------------------------------
# read.csv2() expects the semicolon-separated, decimal-comma CSV dialect.
ehp <- read.csv2(file = "eigenheimpreise.csv", header = TRUE)
dim(ehp)              # number of rows and columns
ehp[seq_len(10), ]    # first ten observations
# The sections below refer to the columns by their bare names, so the data
# frame is placed on the search path here (and detached again further down).
attach(ehp)


# Scatter plot and box plots
# --------------------------
# Price against lot size.
plot(price ~ lotsize, data = ehp, type = "p")

# One box plot of price per level of each discrete regressor.
for (regressor in c("bedrooms", "bathrooms", "stories", "garage",
                    "fullbase", "recreation", "gasheat", "aircon")) {
  boxplot(reformulate(regressor, response = "price"), data = ehp,
          xlab = regressor, ylab = "price")
}


# Examination of the regressand
# -----------------------------
# Levels of both variables, with the simple-regression line overlaid.
plot(price ~ lotsize, data = ehp, type = "p")
abline(lm(price ~ lotsize, data = ehp), col = "blue")
# The scatter plot suggests heteroskedastic disturbances.

# Log-transformed regressand only.
plot(log(price) ~ lotsize, data = ehp, type = "p")
abline(lm(log(price) ~ lotsize, data = ehp), col = "blue")
# After log-transforming the regressand, the relationship between the
# variables tends to look nonlinear.

# Log-transformed regressand and regressor.
plot(log(price) ~ log(lotsize), data = ehp, type = "p")
abline(lm(log(price) ~ log(lotsize), data = ehp), col = "blue")
# After log-transforming both regressand and regressor, the disturbances
# appear homoskedastic and the relationship between the variables is linear.

# Histogram of the prices on the density scale, overlaid with a kernel
# density estimate (a smoothed version of the frequency density).
hist(ehp$price, freq = FALSE,
     main = "Home Prices", xlab = "price", ylab = "density")
price.den <- density(ehp$price, kernel = "gaussian", adjust = 1.25)
lines(price.den, col = "blue")
# The distribution of the prices is clearly right-skewed.

# Same display for the log prices.
hist(log(ehp$price), freq = FALSE,
     main = "Log - Home Prices", xlab = "log(price)", ylab = "density")
logprice.den <- density(log(ehp$price), kernel = "gaussian", adjust = 1.25)
lines(logprice.den, col = "blue")
# The log transformation makes the distribution roughly symmetric.


# "logehp" becomes the new working data frame
# -------------------------------------------
# Price and lot size are replaced by their logarithms and renamed to
# "log.price" / "log.lotsize". The columns are addressed by name rather than
# by position, so a different column order in the CSV file cannot cause the
# wrong variables to be transformed.
log.cols <- c("price", "lotsize")
stopifnot(all(log.cols %in% names(ehp)))

logehp <- ehp
logehp[log.cols] <- log(ehp[log.cols])

var.names <- names(ehp)
var.names                                   # show the original column names
var.names[match(log.cols, var.names)] <- paste0("log.", log.cols)
names(logehp) <- var.names

# Swap the attached data frame: later sections refer to the columns of
# "logehp" by their bare names, and the script ends with detach(logehp).
detach(ehp)
attach(logehp)
head(logehp, 10)                            # first (up to) ten observations


# Estimation of the regression model
# ----------------------------------
# log.price is regressed on every other column of logehp.
modell <- lm(log.price ~ ., data = logehp)
summary(modell)
confint(modell, level = 0.95)   # 95% confidence intervals for the coefficients

# Components of the fitted model object
coef(modell)        # estimated coefficients
fitted(modell)      # fitted values
residuals(modell)   # residuals


# Residual analysis
# -----------------
# Residuals against fitted values, with a horizontal reference line at zero.
plot(fitted(modell), resid(modell),
     main = "Residual Plot", xlab = "fitted values", ylab = "residuals")
abline(h = 0)

# Normality check based on the studentized residuals.
studres <- rstudent(modell)
qqnorm(studres)          # normal Q-Q plot of the studentized residuals
qqline(studres)
shapiro.test(studres)    # Shapiro-Wilk test

# Histogram of the raw residuals with a kernel density estimate.
hist(resid(modell), freq = FALSE,
     main = "Residual Density", xlab = "residuals", ylab = "density")
lines(density(resid(modell), kernel = "gaussian", adjust = 1.2), col = "blue")


# Marginal effects of the regressors
# ----------------------------------
summary(modell)
# Coefficients on the log-price scale ...
coef(modell)
# ... and exponentiated, i.e. as multiplicative factors on the price scale.
exp(coef(modell))


# Confidence and prediction intervals
# -----------------------------------
# Regression function for the transformed (log) data: observed log prices
# against fitted values, with the identity line as the regression function.
fit <- modell$fitted.values
# lines() joins the points in the order they are given. All curves are
# therefore drawn in ascending order of the fitted values; in data-row order
# they would run back and forth across the plot. With several regressors the
# interval limits are not a smooth function of the fitted value, so the bands
# can still look jagged.
ord <- order(fit)
plot(fit, logehp$log.price, xlab = "fitted values", ylab = "log(price)",
     main = "Regression Function")
lines(fit[ord], fit[ord], col = "blue")

# 95% confidence interval for the regression function
clim <- predict(modell, newdata = logehp, interval = "confidence", level = 0.95)
plot(fit, logehp$log.price, xlab = "fitted values", ylab = "log(price)",
     main = "Regression Function")
lines(clim[ord, "fit"], clim[ord, "fit"], col = "blue")
lines(clim[ord, "fit"], clim[ord, "lwr"], col = "red")
lines(clim[ord, "fit"], clim[ord, "upr"], col = "red")

# 95% prediction interval for individual observations
plim <- predict(modell, newdata = logehp, interval = "prediction", level = 0.95)
plot(fit, logehp$log.price, xlab = "fitted values", ylab = "log(price)",
     main = "Regression Function")
lines(plim[ord, "fit"], plim[ord, "fit"], col = "blue")
lines(plim[ord, "fit"], plim[ord, "lwr"], col = "green")
lines(plim[ord, "fit"], plim[ord, "upr"], col = "green")

# Regression function for the original (untransformed) data
# Everything is mapped back to the price scale with exp().
price <- exp(logehp$log.price)
fit   <- exp(modell$fitted.values)
eplim <- exp(plim)   # back-transformed prediction limits
eclim <- exp(clim)   # back-transformed confidence limits
# lines() joins the points in the order they are given, so the curves are
# drawn in ascending order of the fitted values instead of data-row order.
ord <- order(fit)
plot(fit, price, xlab = "fitted values", ylab = "price",
     main = "Regression Function")
lines(fit[ord], fit[ord], col = "blue")
lines(fit[ord], eplim[ord, "lwr"], col = "green")
lines(fit[ord], eplim[ord, "upr"], col = "green")
lines(fit[ord], eclim[ord, "lwr"], col = "red")
lines(fit[ord], eclim[ord, "upr"], col = "red")


# End of script: take "logehp" off the search path again.
detach("logehp", character.only = TRUE)