## ## R functions for delling with SCOP data ## Version 0.3 Tue Apr 1 15:32:29 PDT 2008 ## Version 0.2 Wed Mar 5 16:17:29 PST 2008 ## Version 0.1 Wed Aug 31 10:52:29 PDT 2005 ## ## http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html#scop-parseable-files ## sid => scop domain identifer (d1bg2__) ## sccs => scop compact clasification string (c.37.1.1) ## sunid => scop unique integer ids for each scop node (74353) ## the nodes being:- ## class (cl), fold (cf), superfamily (sf), ## family (fa), protein domain (dm), species (sp) ## ## Requirments: ## bio3d package, ## filehash package ## ## Functions: ## load.scop loads scop data into R ## sunid.search rtn scop classification info for sunids ## sunid.filler rtn scop classification info for sunids ## txt.search search for text in scop description files ## pdb.search search for pdb ids or scop ids ## list.scop.node list all the children of a given sunid ## add.scop.path add the path to coordinate files for a list of sids ## seqaln.scop produce a seq alignment of sids ## ## Usage: ## scop <- load.scop() ## scop <- load.scop(TRUE) ## info <- pdb.search("1bg2", scop) ## kin.sf <- list.scop.node( substr(info[,"sf"],4,100), scop ) ## kin.fa <- list.scop.node( substr(info[,"fa"],4,100), scop ) ## kin.dm <- list.scop.node( substr(info[,"dm"],4,100), scop ) ## ## add.scop.path(kin.dm[,"sid"]) ## aln <- seqaln.scop(kin.dm[,"sid"]) ## scop <- load.scop(T, inpath="http://scop.mrc-lmb.cam.ac.uk/scop/parse/") sunid.search <- function(sunid, scop.data, restrict=c("all","sf","fa","dm","sp")) { ## speeds up search by restricting to 'level' restrict <- match.arg(restrict) if(restrict == "all") { out <- (scop.data$des[ (scop.data$des[,"sunid"] %in% sunid), ]) } else { sc <- scop.data$des[ scop.data$des[,"node"]==restrict, ] out <- (sc[ (sc[,"sunid"] %in% sunid), ]) } if(is.vector(out)) ## ensure we rtn a matrix out <- t(as.matrix(out)) return(out) } sunid.filler <- function(sunid, scop.data, level=c("all","sf","fa","dm","sp")) { ## Take a vector of sunid(s) and look up only unique ## entries filling in the duplicated guys in a slow ## 'for' loop that is still quicker than straight ## 'sunid.search' which looks up everything ## p <- sunid.filler(substr(scop$cla[, "sf"],4,100), scop, level="sf") level <- match.arg(level) blank <- rep(NA,length(sunid)) sunq <- unique(sunid) blank[!duplicated(sunid)] <- sunid.search(sunq, scop.data, level)[,"des"] for(i in sunq) { fill.ind <- which(sunid == i) blank[fill.ind] <- blank[fill.ind[1]] } return(blank) } load.scop <- function(flag2read = FALSE, hash=TRUE, ##inpath="http://scop.mrc-lmb.cam.ac.uk/scop/parse/", inpath="/net/home/lskjaev/Documents/bio3d_groel/scop/1.73/", outpath=inpath, version="1.73") { ## scop <- load.scop() ## scop <- load.scop(flag2read = TRUE) if(!flag2read) { ## Load previously parsed and saved data which can be ## either a filehash key-value database file or a regular ## binary .RData file if(hash) { library(filehash) scop <- dbInit( paste(outpath,"scop.data.fh",sep="") ) } else { load( paste(outpath,"scop.data.R",sep="") ) } } else { ## Otherwise read the four 'raw' scop data files afresh ## and create either a filehash key-value database file ## or a regular binary .RData file cat(paste(" Reading files from path:\n ", inpath, "\n Taking Version:", version,"\n")) files <- paste(inpath, c("dir.des.scop.txt_", "dir.cla.scop.txt_", "dir.com.scop.txt_", "dir.hie.scop.txt_"), version, sep="") ## 1. Description file des <- matrix(scan( files[1], sep = "\t", comment.char = "#", quote="", character()), ncol=5, byrow=T) colnames(des) <- c("sunid","node","sccs","sid","des") ## 2. Clasification file cla.tmp <- matrix(scan( files[2], sep = "\t", comment.char = "#", quote="", character()), ncol=6, byrow=T) lev.id <- matrix(unlist(strsplit(cla.tmp[,6], ",")), ncol=7, byrow=T) cla <- cbind(cla.tmp[,c(1,4)],lev.id) colnames(cla) <- c("sid","sccs","cl","cf","sf","fa","dm","sp","px") rm(cla.tmp,lev.id) ## 3. Comment file (not really necesary) com.tmp <- scan( files[3], sep = "\n", comment.char = "#", quote="", character()) com <- cbind(substr(com.tmp,0,5), substr(com.tmp,7,5000)) colnames(com) <- c("sunid","com") rm(com.tmp) ## 4. Hierarchy file (in terms of sunid) hir <- matrix(scan( files[4], sep = "\t", comment.char = "#", quote="", character()), ncol=3, byrow=T) ##- Create Anotation Object (a combination of des and cla files) cat("Creating new $anot anotation object: this can be slow\n") colnam <- c("pdbid", colnames(cla), "sf.txt", "fa.txt", "dm.txt", "sp.txt") anot <- matrix(NA, ncol=length(colnam), nrow=nrow(cla)) colnames(anot) <- colnam rownames(anot) <- cla[,"sid"] anot[,"pdbid"] <- substr(cla[,"sid"],2,5) anot[,2:(ncol(cla)+1)] <- cla sc <- list(des=des) anot[,"sf.txt"] <- sunid.filler(substr(cla[, "sf"],4,100), sc, level="sf") anot[,"fa.txt"] <- sunid.filler(substr(cla[, "fa"],4,100), sc, level="fa") anot[,"dm.txt"] <- sunid.filler(substr(cla[, "dm"],4,100), sc, level="dm") anot[,"sp.txt"] <- sunid.filler(substr(cla[, "sp"],4,100), sc, level="sp") rm(sc, colnam) ##-- Save parsed output for future use if(hash) { ## A filehash key-value database file library(filehash) dbCreate( paste(outpath,"scop.data.fh",sep="") ) scop <- dbInit( paste(outpath,"scop.data.fh",sep="") ) dbInsert(scop, "des", des) dbInsert(scop, "cla", cla) dbInsert(scop, "com", com) dbInsert(scop, "hir", hir) dbInsert(scop, "anot", anot) } else { ## A binary .RData file scop <- list(des=des, cla=cla, com=com, hir=hir, anot=anot) save(scop, file=paste(outpath,"scop.data.R",sep="") ) } } scop } txt.search <- function(txt, scop.data, ignore.case=TRUE) { ## txt.search("motor", scop) i <- grep(txt, scop.data$des[,"des"], ignore.case=ignore.case ) return(scop.data$des[i,]) } pdb.search <- function(code, scop.data, sid=FALSE) { ## pdb.search2("2mys", scop) if(sid) { out <- scop.data$anot[scop.data$anot[,"sid"] %in% code, , drop = FALSE] } else { out <- scop.data$anot[scop.data$anot[,"pdbid"] %in% code, , drop = FALSE] } ## if(is.vector(out)) ## ensure we rtn a matrix (use 'drop = FALSE') ## out <- t(as.matrix(out)) return(out) } list.scop.node <- function(sunid, scop.data) { ## Description: ## produce a list of all the structures ## that are children of a given sunid ## (i.e. "scop unique id" e.g. 52646) ## Useage: ## ploops <- list.scop.node("52540") ## motors <- list.scop.node("52646") if(length(sunid)>1) warning(" ** Multiple sunid inputs hence childern may be from MULTIPLE nodes **") nodes <- sunid.search( sunid, scop.data )[,"node"] node <- unique(nodes) if(length(node) == 1) { out <- scop.data$anot[ (scop.data$anot[,node] %in% paste(node,"=",sunid,sep="")), , drop = FALSE] } else { index <- NULL for(i in node) { index <- c(index, which(scop.data$anot[,i] %in% paste(i,"=",sunid[(nodes==i)],sep="")) ) } out <- scop.data$anot[ index, , drop = FALSE] } class(out) <- "scoplist" return(out) } summary.scoplist <- function(node.list) { n.all <- node.list[,"pdbid"] n.unq <- unique(n.all) na <- length(n.all) nu <- length(n.unq) f.all <- node.list[,"fa"] f.unq <- unique(f.all) nf <- length(f.unq) d.all <- node.list[,"dm"] d.unq <- unique(d.all) nd <- length(d.unq) ns <- length( unique(node.list[,"sp"]) ) txt <- paste("** Overview: N.all =",na, ", N.unq=",nu, ", N.fam=",nf, ", N.dom=",nd, ", N.sp=", ns," **") dms <- unique(node.list[,"dm.txt"]) cat(txt, dms, txt, sep="\n") out <- c(na, nu, nf, nd) } add.scop.path <- function(sids, path="/u3/bgrant/scopdb/1.73/pdbstyle-1.73/") { ## Description: ## takes scop "sids" (e.g. "d1bg2__") and adds path to ## structure files ## Useage: ## info <- pdb.search2("1bg2", scop) ## dom <- list.scop.node2( substr(info[,"dm"],4,100), scop ) ## coord.files <- add.scop.path( dom[,"sid"] ) return( file.path(path, substr( as.character(sids), 3, 4 ), paste(sids, ".ent", sep="")) ) } seqaln.scop <- function(sids, outfile="raw_aln.fa", ...) { ## Description: ## produce multiple seq alignment for given scop "sid" list ## results are writen to "outfile" and put returned as a ## data.frame ## Useage: ## aln <- seqaln.scop(aln.list, outfile="seqs_aln.fa") files <- add.scop.path(sids) toread <- file.exists(files) if(all(!toread)) stop("No corresponding PDB files found") ## Extract sequences raw <- NULL for(i in 1:length(files)) { cat(paste("pdb/seq:",i," name:", sids[i]),"\n") if(!toread[i]) { warning(paste("No PDB file found for sid", sids[i], ": (with filename) ",files[i]), call.=FALSE) } else { pdb <- read.pdb(files[i]) raw <- seqbind(raw, aa321(pdb$atom[pdb$calpha,"resid"])) } } ## Align sequences (have a look at this file!) return(seqaln(raw, id=files, file=outfile, ...)) } ## http://astral.berkeley.edu/pdbstyle.cgi?id= fetch.scop <- function(ids, path="./") { if(any(nchar(ids) != 7)) { warning("ids should be standard 7 character SID formart: trying first 7 char ...") ids <- substr(basename(ids),1,7) } ids <- unique(ids) pdb.files <- paste(ids, ".pdb", sep="") get.files <- paste("http://astral.berkeley.edu/pdbstyle.cgi?id=", ids, sep="") put.files <- file.path( path, pdb.files) dir.create(path) rtn <- rep(NA, length(pdb.files)) for(k in 1:length(pdb.files)) { rtn[k] <- download.file( get.files[k], put.files[k] ) } names(rtn) <- file.path(path, paste(ids, ".pdb", sep = "")) if (any(rtn == 1)) { warning("Some files could not be downloaded, check returned value") return(rtn) } else { return(names(rtn)) } # names(rtn) <- ids # rtn }