User Tools

Site Tools


wiki:bivariate_r

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

wiki:bivariate_r [2017/12/05 22:53] (current)
Line 1: Line 1:
 +===== Bivariate plot of predictor variables correlation for testing multicollinearity =====
  
 +  * [[|Multicollinearity]] is a statistical phenomenon in which two or more predictor variables in a multiple regression model are highly correlated.\\
 +In the presence of multicollinearity,​ the estimate of one variable'​s impact on y while checking for the others tends to be less precise than if predictors were uncorrelated with one another (sources Wikipedia).\\
 +
 +Plotting a bivariate correlation matrix between predictor variables ranked according to their importance in a random forest model, allows us to study the potential effect of multicollinearity.
 +We expect high correlated variables having similar ranking values and the opposite.\\
 +
 +
 +
 +\\
 +\\
 +
 +<code r>
 +# load libraries
 +library(foreign) # needed for  "​read.dbf"​
 +library(gdata) # needed for "drop levels"​
 +library(lattice)
 +\\
 +# load predictors DataFrame and clean it from the NA values
 +
 +INPUT = ("​~/​ost4sem/​exercise/​basic_adv_r/​inputs/"​)
 +OUTPUT = ("​~/​ost4sem/​exercise/​basic_adv_r/​outputs/"​)
 +
 +# load data
 +load(paste(INPUT,"​predictor_rank",​sep=""​))
 +
 +# select and rank data
 +pred44 = subset(predictor_rank,​ select =c(pr123,​pr103,​pr101,​pr120,​pr126,​pr127,​pr128,​pr200,​pr201,​pr111,​pr136,​pr202,​pr138,​pr134,​pr198,​pr110,​pr131,​pr122,​pr104,​pr118,​pr105,​pr106,​pr137,​pr102,​pr139,​pr135,​pr108,​pr121,​pr109,​pr2626,​pr124,​pr199,​pr129,​pr116,​pr125,​pr203,​pr112,​pr114,​pr117,​pr113,​pr107,​pr204,​pr115,​pr119))
 +pred44$pr2626 ​ = as.numeric(pred44$pr2626)
 +
 +
 +# Create a bivariate correlation matrix using adjusted R squared error values
 +mat = matrix(NA, nrow=44, ncol=44)
 +for (li in 1:44){
 +for (co in 1:44){
 +temp = lm(pred44[[li]]~ pred44[[co]])
 +mat[li,co] = summary(temp)$adj.r.squared
 +}}
 +colnames(mat)= names(pred44)
 +rownames(mat) = names(pred44)
 +
 +
 +# ------ Define a function for plotting a matrix ------ #
 +# original function image()
 +# modified function : http://​www.phaget4.org/​R/​image_matrix.html
 +myImagePlot <- function(x, ...){
 +     min <- min(x)
 +     max <- max(x)
 +     ​yLabels <- rownames(x)
 +     ​xLabels <- colnames(x)
 +     title <-c()
 +  # check for additional function arguments
 +  if( length(list(...)) ){
 +    Lst <- list(...)
 +    if( !is.null(Lst$zlim) ){
 +       min <- Lst$zlim[1]
 +       max <- Lst$zlim[2]
 +    }
 +    if( !is.null(Lst$yLabels) ){
 +       ​yLabels <- c(Lst$yLabels)
 +    }
 +    if( !is.null(Lst$xLabels) ){
 +       ​xLabels <- c(Lst$xLabels)
 +    }
 +    if( !is.null(Lst$title) ){
 +       title <- Lst$title
 +    }
 +  }
 +# check for null values
 +if( is.null(xLabels) ){
 +   ​xLabels <- c(1:​ncol(x))
 +}
 +if( is.null(yLabels) ){
 +   ​yLabels <- c(1:​nrow(x))
 +}
 +
 +layout(matrix(data=c(1,​2),​ nrow=1, ncol=2), widths=c(4,​1),​ heights=c(1,​1))
 +
 + # Red and green range from 0 to 1 while Blue ranges from 1 to 0
 + ​ColorRamp <- rgb( seq(0,​1,​length=256), ​ # Red
 +                   ​seq(0,​1,​length=256), ​ # Green
 +                   ​seq(1,​0,​length=256)) ​ # Blue
 + ​ColorLevels <- seq(min, max, length=length(ColorRamp))
 +
 + # Reverse Y axis
 + ​reverse <- nrow(x) : 1
 + ​yLabels <- yLabels[reverse]
 + x <- x[reverse,]
 +
 + # Data Map
 + ​par(mar = c(3,​5,​2.5,​2))
 + ​image(1:​length(xLabels),​ 1:​length(yLabels),​ t(x), col=ColorRamp,​ xlab="",​
 + ​ylab="",​ axes=FALSE, zlim=c(min,​max))
 + if( !is.null(title) ){
 +    title(main=title)
 + }
 +axis(BELOW<​-1,​ at=1:​length(xLabels),​ labels=xLabels,​ cex.axis=0.7)
 + ​axis(LEFT <-2, at=1:​length(yLabels),​ labels=yLabels,​ las= HORIZONTAL<​-1,​
 + ​cex.axis=0.7)
 +
 + # Color Scale
 + ​par(mar = c(3,​2.5,​2.5,​2))
 + ​image(1,​ ColorLevels,​
 +      matrix(data=ColorLevels,​ ncol=length(ColorLevels),​nrow=1),​
 +      col=ColorRamp,​
 +      xlab="",​ylab="",​
 +      xaxt="​n"​)
 + ​layout(1)
 +}
 +# ----- END plot function ----- #
 +
 +paste(OUTPUT,"​predictor_rank",​sep=""​)
 +png(paste(INPUT,"​bivariate_correlation.png",​sep=""​),​width = 1500, height = 1000)
 +myImagePlot(mat,​ xlabels=dimnames(mat)[[1]],​ ylabels=dimnames(mat)[[1]], ​ title=c("​Bivariate correlations of predictor variables - Adjusted r squared values"​),​cex=2) ​
 +dev.off()
 +
 +</​code>​
wiki/bivariate_r.txt ยท Last modified: 2017/12/05 22:53 (external edit)