#check all mean and median for na.rm=T ####################################################SOMATICS######################################### #Aim: SNP chip analysis of heterogeneous tumor tissue samples. Works on any tissue sample (blood or tumor) processed with BeadArray #Main features: from one tissue sample: germline deletions and amplifications, somatic deletions and amplifications, proportion of cells with a somatic event, germline genotype inferrence, somatic genotype inferrence #The script is organized by modules,following the "supplemental methods" of the SOMATICs method paper (Assie et al., 2007) #Module 0: data preparation (from Beadstudio data) #Module 1: Determination of Reference logR and BAF distributions (chip noise analysis) #Module 2: Detection of the 2-band patterns of Allelic Imbalance #Module 3: Percentage of cells in Allelic imbalance (for each chromosomal event) #Module 4: Copy Number Variations calls #Module 5: Germline genotype Inferrence ###################################################################################################### ###################################################################################################### #####################Module 0: data preparation####################################################### #Aim: converts the data text files into matrixes saved as Rdata files #Three input files: directly generated with BeadStudio # BAF : column1: Chr; column2: Position; Next columns: samples values (Column names should have no space character). # logR: same # annotation: column1: SNP names, column2: Chr, column3: Position #Two output files: # 1. BAF: numerical matrix. # Structure: Row names: SNP names; Column 1: "Chr"; Column 2: "Position"; Next columns: Samples values # 2. logR: same structure #Remarks: chromosome X is called 23 (chr Y might be called 24) #Remarks: for data sets with several samples, it is better to export the data from Beadstudio BY CHROMOSOME (huge files!!). # The following script imports data set saved by chromosome. # Example: file names for chromosome 1: "BeadStudio_BAF_K1.txt", "BeadStudio_logR_K1.txt", "BeadStudio_annotations_K1.txt" #Remarks: the number of samples has to be specified in the first two lines of the following script, after "ncol=" Example: for 40 samples, ncol=40+2 ###################################################################################################### BAF=matrix(nrow=0,ncol=40+2) logR=matrix(nrow=0,ncol=40+2) #ncol: number of samples+2 SNP_names=vector(mode="character") for (K in 1:23) { BAF_temp=read.table(paste("BeadStudio_BAF_K",K,".txt",sep=''),header=T) BAF_temp=BAF_temp[order(BAF_temp$Position),] #conversion of "X" to 23: if (K==23) BAF_temp[,1]=23 BAF=rbind(BAF,as.matrix(BAF_temp)) logR_temp=read.table(paste("BeadStudio_logR_K",K,".txt",sep=''),header=T) logR_temp=logR_temp[order(logR_temp$Position),] logR=rbind(logR,as.matrix(logR_temp)) #conversion of "X" to 23: if (K==23) logR_temp[,1]=23 annotations_temp=read.table(paste("BeadStudio_annotations_K",K,".txt",sep=''),header=T) annotations_temp=annotations_temp[order(annotations_temp$Position),] SNP_names=c(SNP_names,as.character(annotations_temp[,1])) } dimnames(BAF)=list(SNP_names,dimnames(BAF)[[2]]) save(BAF,file="BAF.Rdata") dimnames(logR)=list(SNP_names,dimnames(logR)[[2]]) save(logR,file="logR.Rdata") #####################End of module 0######################################################### ###################################################################################################### #####################Module 1:Determination Reference logR and BAF distributions################# #Aim: determine the BAF reference distributions for genotypes AA, AB and BB. #input: BAF.Rdata and logR.Rdata #output: "zReference_Chromosomes.txt": text file, 1 line by sample; # column1: chromosome selected as the reference for BAF distribution of SNPs AB # column2: chromosome selected as the reference for BAF distribution of SNPs AA and BB # column1: chromosome selected as the reference for logR distribution of SNPs #Remarks: visual check is recommanded (a plotting module is included) ###################################################################################################### load("BAF.Rdata") load("logR.Rdata") sample.size=dim(BAF)[2]-2 sample.names=dimnames(BAF)[[2]] sample.names=sample.names[3:length(sample.names)] BAF_variances=matrix(nrow=sample.size,ncol=22*3) logR_variance=matrix(nrow=sample.size*2,ncol=22) pos_hetsvar=seq(1,66,by=3) pos_homovar=seq(2,66,by=3) pos_ratio=seq(3,66,by=3) mini=matrix(nrow=sample.size,ncol=3) chromosome=BAF[,1] for (i in 1:sample.size) { #BAF reference chromosomes donnee=BAF[,i+2] for (K in 1:22) { pos=which(chromosome==K) donnee2=donnee[pos] posAB=which((donnee2>=0.25)&(donnee2<=0.75)) BAF_variances[i,(K-1)*3+1]=var(donnee2[posAB],na.rm=T) posAA=which(donnee2<0.25) posBB=which(donnee2>0.75) BAF_variances[i,(K-1)*3+2]=var(c(donnee2[posBB],donnee2[posAA]+1),na.rm=T) BAF_variances[i,(K-1)*3+3]=length(posAB)/(length(posAA)+length(posBB)+length(posAB)) } #selection of chromosomes with at leat 1/3 of het SNPs (for HumanHap300, het ratio is 0.35 globally) pos=which(BAF_variances[i,pos_ratio]>=1/3) pos2=pos_hetsvar[pos] if (length(pos2)>0) { mini[i,1]=which(BAF_variances[i,pos_hetsvar]==min(BAF_variances[i,pos2])) } mini[i,2]=which(BAF_variances[i,pos_homovar]==min(BAF_variances[i,pos_homovar])) #logR reference chromosomes donnee=logR[,i+2] for (K in 1:22) { pos=which(chromosome==K) donnee2=donnee[pos] logR_variance[i*2-1,K]=var(donnee2,na.rm=T) logR_variance[i*2,K]=mean(donnee2,na.rm=T) } #selection of chromosomes with a meanlogR around 0 pos=which((logR_variance[i*2,]<(-0.1))|(logR_variance[i*2,]>(0.1))) if (length(pos)>0) logR_variance[i*2-1,pos]=NA donnee=logR_variance[i*2-1,] mini[i,3]=which(donnee==min(donnee,na.rm=T)) plot(1,1,type='n') text(1,1,paste("Sample",i)) } dimnames(mini)=list(sample.names,c("BAF_Het_RefChr","BAF_Homo_RefChr","logR_RefChr")) write.table(mini,"zReference_Chromosomes.txt",sep='\t') #checking the absence of massive alterations on ref chromosome data=read.table("zReference_Chromosomes.txt") par(mfrow=c(6,4),mar=c(0,0,0,0)) for (i in 1:sample.size) { K=data[i,1] pos=which(chromosome==K) plot(pos,BAF[pos,i+2],pch=".",ylim=c(-1,1),xaxt='n') par(new=T) plot(pos,logR[pos,i+2]/2-0.5,pch=".",ylim=c(-1,1),xaxt='n') text(mean(pos),0.7,paste("BAF_het_ref",i),col='green',cex=1) K=data[i,2] pos=which(chromosome==K) plot(pos,BAF[pos,i+2],pch=".",ylim=c(-1,1),xaxt='n') par(new=T) plot(pos,logR[pos,i+2]/2-0.5,pch=".",ylim=c(-1,1),xaxt='n') text(mean(pos),0.7,paste("BAF_homo_ref",i),col='green',cex=1) K=data[i,3] pos=which(chromosome==K) plot(pos,BAF[pos,i+2],pch=".",ylim=c(-1,1),xaxt='n') par(new=T) plot(pos,logR[pos,i+2]/2-0.5,pch=".",ylim=c(-1,1),xaxt='n') text(mean(pos),0.7,paste("LogR_ref",i),col='green',cex=1) } ###################################End of Module 1#################################################### ###################################################################################################### #####################Module 2: detection of the 2-band patterns of Allelic Imbalance################## #Aim: Identify the 2-band patterns of AI #Input: BAF.Rdata, logR.Rdata, "zReference_Chromosomes.txt" + libraries diptest and aws #Output: # Main output: "zAllelicImbalance_i_raw.txt": col1: start rank; col2: end rank; col3: mean BAF; col4: mean logR, col5: N_hets # Intermediate files: "zRed_samplei.Rdata","zblue_samplei.Rdata","zgreen_samplei.Rdata",""zAI_all_samplei.Rdata": location of abnormal SNPs. #Remarks: The script is currently set for humanhap300. For other chips, replace "317503" by the actual number of SNPs (also replace 317504 by number+1) ###################################################################################################### #library for uni/bimodal distribution library(diptest) library(aws) ####loading the dataset ##1 sample per column. First 2 columns: physical mapping info. load("BAF.Rdata") load("logR.Rdata") ####loading the reference chromosomes information mini=read.table("zReference_Chromosomes.txt")[,1:2] mini_logR=read.table("zReference_Chromosomes.txt")[,3] sample.size=dim(BAF)[2]-2 ####setting the chromosome limits chromosome=BAF[,1] position_all=vector(mode='numeric') SNP_names=dimnames(BAF)[[1]] compteur=0 for (K in 1:23) { pos=which(chromosome==K) position_all=c(position_all,BAF[pos,2]) } minK=vector(mode='numeric') maxK=vector(mode='numeric') for (K in 1:23) { pos=which(chromosome==K) minK[K]=min(pos) maxK[K]=max(pos) } #general loop (by sample) for (i in 1:sample.size) { #AI events are reported in 3 binary (0 or 1) variables (green, red, blue), according to the location of BAF of their hets. #The BAF space is split in 3 regions #green: region of obvious AI fragments #red: AI barely detectable because close to 0.5 #blue: AI barely detectable because close to 0 and 1 green=rep(0,317503) red=rep(0,317503) blue=rep(0,317503) donnees=BAF[,i+2] donnees_logR=logR[,i+2] ##setting the BAF thresholds for splitting the BAF space into green, red, blue regions #Reference distribution of homozygous SNPs pos_homo=which(chromosome==mini[i,2]) posAA=which(donnees[pos_homo]<0.25) posAA=pos_homo[posAA] a=hist(donnees[posAA],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.95) {break} compteur=compteur+1 } CA=values[compteur] cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.999) {break} compteur=compteur+1 } GA=values[compteur] GA=min(GA,0.1) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.99) {break} compteur=compteur+1 } CA99=values[compteur] CA99=min(CA,0.1) posBB=which(donnees[pos_homo]>0.75) posBB=pos_homo[posBB] a=hist(donnees[posBB],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.05) {break} compteur=compteur+1 } CB=values[compteur] cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.001) {break} compteur=compteur+1 } GB=values[compteur] GB=max(GB,0.9) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.01) {break} compteur=compteur+1 } CB99=values[compteur] CB99=max(CB99,0.9) #distr hetero pos_het=which(chromosome==mini[i,1]) posAB=which((donnees[pos_het]>=0.25)&(donnees[pos_het]<=0.75)) posAB=pos_het[posAB] mean_het=mean(donnees[posAB],na.rm=T) posAAB=which((donnees[pos_het]>=0.25)&(donnees[pos_het]<=mean_het)) posAAB=pos_het[posAAB] posABB=which((donnees[pos_het]>=mean_het)&(donnees[pos_het]<=0.75)) posABB=pos_het[posABB] a=hist(donnees[posAAB],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.05) {break} compteur=compteur+1 } ChetA=values[compteur] cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.001) {break} compteur=compteur+1 } GhetA=values[compteur] GhetA=max(GhetA,mean_het-0.1) a=hist(donnees[posABB],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.95) {break} compteur=compteur+1 } ChetB=values[compteur] cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.999) {break} compteur=compteur+1 } GhetB=values[compteur] GhetB=min(GhetB,mean_het+0.1) #other cuts-off BhetA=0.25 RA=0.25 BhetB=0.75 RB=0.75 BA=CA BB=CB RhetA=ChetA RhetB=ChetB #####module : Identification of obvious AI events ("green" region) green=rep(0,317503) for (K in 1:23) { pos_K=which(chromosome==K) donnees_K=donnees[pos_K] #obvious SNPs pos_green=which(((donnees_K>GA)&(donnees_KGhetB)&(donnees_K0) { temp=rep(NA,length(pos_K)) temp[pos_green]=1 #hets pos_het=which((donnees_K>=GhetA)&(donnees_K<=GhetB)) temp[pos_het]=0 #identification of fragments of consecutive SNPs pos_notNA=which(!is.na(temp)) temp2=as.numeric(na.omit(temp)) longueur_temp=length(temp2) fragments=matrix(nrow=longueur_temp,ncol=2) fragments[,1]=1 fragments[,2]=0 pos=which(temp2==1) #identifier les fragments consecutifs et repeter par nbre de SNPs consecutifs #identification de stretchs par sommes decalees #generation d'un vecteur resultat2 de listes avec la taille des fragments et la position de la premiere base somme=rep(0,longueur_temp) somme[pos]=1 somme_ref=somme compteur=1 repeat { somme=somme[1:(length(somme)-1)]+somme_ref[(compteur+1):longueur_temp] compteur=compteur+1 pos_stretch=which(somme==compteur) if ((length(pos_stretch)==0)|(compteur>longueur_temp)) { break } for (l in 1:compteur) { fragments[pos_stretch+l-1,2]=compteur } pos2=which(fragments[,2]-fragments[,1]==1) fragments[pos2,1]=compteur fragments[,2]=0 } pos2=which(fragments[,1]>1) temp2[pos2]=fragments[pos2,1] #stretch identification repeat { pos=which(temp2>2) if (length(pos)==0) { break } pos2=min(pos) delta=temp2[pos2] pos_notNA_1=pos_notNA[pos2] pos_notNA_2=pos_notNA[pos2+delta-1] #identification of homo before and after pos=which(pos_het0) { pos=max(pos_het[pos])+1 pos_notNA_1=pos } pos=which(pos_het>pos_notNA_2) if (length(pos)>0) { pos=min(pos_het[pos])-1 pos_notNA_2=pos } #testing that the 2 SNPs are not very few odds in a large homo region (ratio 1/100) pos_x1=pos_K[pos_notNA_1] pos_x2=pos_K[pos_notNA_2] temp=donnees[pos_x1:pos_x2] if (delta/length(which((tempCB)))>0.01) green[pos_x1:pos_x2]=1 #nota: if a region green is preceded or followed by a blue region, the green boundary might extend in part of the blue, due to the presence of green SNPs within the blue #neutralization of the region tested temp2[pos2:(pos2+delta-1)]=0 } } plot(1,1,type='n') text(1,1,paste("Green region, Chr",K)) } save(green,file=paste("zGreen_sample",i,".Rdata",sep='')) ####module: identification of barely detectable AI events because close to 0.5 ("red" region) pos_het=which(chromosome==mini[i,1]) posAB=which((donnees[pos_het]>0.25)&(donnees[pos_het]<0.75)) posAB=pos_het[posAB] donnees_het_ref=donnees[posAB] mean_het=mean(donnees_het_ref,na.rm=T) n_het_ref=length(donnees_het_ref) pos_red=which((donnees>RA)&(donnees=total*temp_cutoff[k]) {break} compteur=compteur+1 } DIP_cutoff[k]=values[compteur] } for (K in 1:23) { posK=which(chromosome_het==K) donnees2=donnees_het[posK] #red=rep(0,317503) for (k in 1:3) { window.width=window.width_values[k] window.step=window.width/25 if (length(donnees2)DIP_cutoff[k]) { #nota: expansion of the window of half a step in order not to miss #...the beginning/end of barely detectable alterations #Therefore the step has to be small. a=min(x1)-window.step/2 if (a>1) { x1=c(a:(min(x1)-1),x1) } else { x1=c(1:min(x1),x1) } a=max(x1)+window.step/2 if (aBA)&(donneesBhetB)&(donnees=BhetA)&(donnees<=BhetB)) vector_het=rep(0,317503) vector_het[pos_het]=1 #determination of potential AI fragments in the blue zone: =any consecutive SNPs in the blue zone #either consecutive, or separated by homozygote SNPs. Coded as 0 in the 'color' variable. Nota: OK for missing values pos_color_ext=c(1,pos_color,317503) color=rep(0,317503) for (j in 1:(length(pos_color)+1)) { x1=pos_color_ext[j] x2=pos_color_ext[j+1] color[x1:(x2-1)]=sum(vector_het[(x1+1):(x2-1)]) } pos=which(color==0) pos[length(pos)]=NA #expanding the intervals to include the first het (intervals are '[]' instead of ']['): pos1=pos+1 pos_missing=which(!(pos1%in%pos)) pos_missing=pos1[pos_missing] pos=sort(c(pos,pos_missing)) color_temp=rep(0,317503) color_temp[pos]=1 color_temp=c(color_temp,0) #distributions homo #nota: this is redone to get the appropriate totalAA, totalBB. pos_homo=which(chromosome==mini[i,2]) posAA=which(donnees[pos_homo]<0.25) posAA=pos_homo[posAA] a=hist(donnees[posAA],breaks=1000) effectivesAA=a[[2]] valuesAA=a[[1]] lengthAA=length(effectivesAA) totalAA=sum(effectives) posBB=which(donnees[pos_homo]>0.75) posBB=pos_homo[posBB] a=hist(donnees[posBB],breaks=1000) effectivesBB=a[[2]] valuesBB=a[[1]] lengthBB=length(effectivesBB) totalBB=sum(effectivesBB) x2=1 #chromosome X x1=minK[23] x2=maxK[23] donnees2=donnees[x1:x2] co_sexK=317504 #sex identification #In Illumina, there is a normalization pb for the X chromosome: in males, the hemizygote SNPs are much #noiser than the homozygote SNPs on females! Then the strategy of barely detectable AI cannot be applied # in males (or would consider somatic AI!!!). That means that if a loss of X would occur on almost all #the cells we could not detect it! # if there is less than 90 SNPs in the Red region, it is a male or a female with X lost. if (length(which((donnees2>GA)&(donnees2100*317503/317503) { #female without X loss co_sexK=317504 } else { #male or female with X lost co_sexK=minK[23] donnees2=donnees[x1:x2] pos=which(donnees2>0.5) donnees2[pos]=donnees2[pos]-1 #sd_homo pos=which(chromosome==mini[i,2]) temp=BAF[pos,i+2] posAA=which(temp<0.25) posBB=which(temp>0.75) temp=c(temp[posAA],temp[posBB]-1) blue[x1:x2]=var(donnees2,na.rm=T)/var(temp,na.rm=T)*7 } repeat { #determination of likelihood of observation if there is no AI: LOD_obsIFnoAI=0 temp=which(color_temp==1) if (length(temp)==0) { break } if (min(temp)>=co_sexK) { break } x1=min(temp) x2=x1+min(which(color_temp[x1:317504]==0))-2 homo_temp=donnees[x1:x2] posAA=which((homo_temp<0.5)&(homo_temp>CA)) posBB=which((homo_temp>0.5)&(homo_temp0) { #for each SNP in the blue zone, determination of the exact quantile in the ref distribution for (j in 1:length(posAA)) { posAA2=posAA[j] b=which(valuesAA>=homo_temp[posAA2]) if (length(b)>0) { LOD_obsIFnoAI=LOD_obsIFnoAI+log10(sum(effectivesAA[(min(b)-1):lengthAA])/totalAA) } else { #SNPs that are so far from homo in the blue zone are affected a proba of 0.0001 (in replacement of 0) LOD_obsIFnoAI=LOD_obsIFnoAI-4 } if (is.na(LOD_obsIFnoAI)) { break } } } if (length(posBB)>0) { for (j in 1:length(posBB)) { posBB2=posBB[j] b=which(valuesBB<=homo_temp[posBB2]) if (length(b)>0) { LOD_obsIFnoAI=LOD_obsIFnoAI+log10(sum(effectivesBB[1:max(b)])/totalBB) } else { LOD_obsIFnoAI=LOD_obsIFnoAI-4 } } } #determination of the likelihood of observation if there is AI poshomo=length(which((homo_tempCB))) N=x2-x1+1 contingency=matrix(nrow=2,ncol=2) contingency[1,]=c(round(N/3),round(N*2/3)) contingency[2,]=c(N-poshomo,poshomo) if (min(contingency)<5) { LOD_obsIFAI=log10(fisher.test(contingency)[[1]]) } else { LOD_obsIFAI=log10(chisq.test(contingency)[[3]]) } color_temp[x1:x2]=0 blue[x1:x2]=LOD_obsIFAI-LOD_obsIFnoAI plot(1,1,type='n') text(1,1,x2) } pos=which(blue==Inf) if (length(pos)>0) blue[pos]=400 #nota: runs until it reaches 317503. save(blue,file=paste("zLOD_AI_barelyhomo_sample",i,".Rdata",sep='')) # load(paste("zRed_sample",i,".Rdata",sep='')) # load(paste("zGreen_sample",i,".Rdata",sep='')) # load(paste("zLOD_AI_barelyhomo_sample",i,".Rdata",sep='')) ###gathering the data from the 3 regions AI_all=rep(0,317503) pos=which(green==1) AI_all[pos]=1 pos=which(blue>40) AI_all[pos]=1 pos=which(red==1) AI_all[pos]=1 save(AI_all,file=paste("zAI_all_sample",i,".Rdata",sep='')) #####Module Smoothing and segmentation of SNPs in AI #removal of homozygote SNPs. Nota: removal is done before flapping bec distribution is not symetric donnees_nohomo=donnees posAA=which(donneesCB99) donnees_nohomo[posBB]=NA #for regions not in AI, use of a more stringent cutoff pos=which(AI_all==0) cutoff=min(0.1,GA) posAA=which(donnees[pos]cutoff) posBB=pos[posBB] donnees_nohomo[posBB]=NA #flapping the BAF over mean_het pos=which(donnees_nohomo>mean_het) donnees_flapped=donnees_nohomo donnees_flapped[pos]=mean_het-(donnees_nohomo[pos]-mean_het)*mean_het/(1-mean_het) #selection of SNPs in AI pos=which(AI_all==0) AI_only=AI_all AI_only[pos]=NA #Smoothing and segmentation of each AI; collection of the data AI_CNV=matrix(nrow=0,ncol=5) pos_notNA=which(!is.na(donnees_flapped)) repeat { pos=which(!is.na(AI_only)) if (length(pos)==0) { break } x1=min(pos) x2=min(which(is.na(AI_only[x1:317504])))+x1-2 donnees_temp=as.numeric(na.omit(donnees_flapped[x1:x2])) pos_notNA_temp=which(!is.na(donnees_flapped[x1:x2])) a=awsh(donnees_temp,graph=T,hinit=3,hmax=500,p=0,sigma2=0.01) # a=awsh(donnees_temp,graph=T,hinit=3,hmax=500,p=0,sigma2=0.03) b=round(a[[1]],2) b=c(b,-1) #mark for the end of the next loop #boundaries identification and filling AI_CNV #delta: half interval in which values are considered equal delta=0.02 #boundaries: try to extend to previous/next het, first inside the contiguous fragment, then around it if the start/end of the fragment is reached repeat { pos2=which(!is.na(b)) if (length(pos2)==0) {break} x1bis=min(pos2) #if the begining of the tested region is reached, then.. if (x1bis==1) { #have to consider last not NA before x1 pos3=which(!is.na(donnees_flapped[1:(x1-1)])) if (length(pos3)>0) { start_rank=max(pos3) } else { start_rank=x1 } } else { x1ter=pos_notNA_temp[x1bis] start_rank=x1+x1ter-1 } x2bis=min(which((bb[x1bis]+delta))) #if the end of the tested region is reached, then ... if (b[x2bis]==-1) { #have to consider first not NA after x2 pos3=which(!is.na(donnees_flapped[(x2+1):317503])) if (length(pos3)>0) { end_rank=x2+min(pos3) } else { end_rank=317503 } } else { x2ter=pos_notNA_temp[x2bis] end_rank=x1+x2ter-1 } AI_CNV=rbind(AI_CNV,c(start_rank,end_rank,median(donnees_flapped[start_rank:end_rank],na.rm=T),median(donnees_logR[start_rank:end_rank],na.rm=T),length(na.omit(donnees_flapped[start_rank:end_rank])))) b[x1bis:x2bis]=NA } AI_only[x1:x2]=NA } dimnames(AI_CNV)=list(seq(1,dim(AI_CNV)[1]),c("Start_rank","End_rank","BAF_het","logR","N_hets")) write.table(AI_CNV,paste("zAllelicImbalance_",i,"_raw.txt",sep=''),sep='\t') } ########################End of module 2############################################################### ###################################################################################################### #####################Module 3: percentage of cells in Allelic imbalance############################### #Aim: Determine the proportion of cells in which each allelic imbalance event occurs #Aim2: Filter the raw allelic imbalance identified previously #Input: BAF.Rdata, logR.Rdata, "zReference_Chromosomes.txt", "zAllelicImbalance_"i"_raw.txt" #Output: # Main: zAI_CNV_filtered_i.txt: table with all AI events filtered (BAF significantly different from reference BAF of heterozygous SNPs) # col1: Start_rank, 2:End_rank, 3: mean of BAF hets, 4: mean logR, 5: N_hets in the fragment, 6: BAF_pvalue compared to ref BAF of hets, 7: logR_pvalue (compared to ref), 8: proportion of cells in the sample with AI, 9: CNV call(preliminary, with 1 (ampl), 0 (copy-neutral), or -1 (deletion) # Graphic: zPropOfCellsWithAI_distribution.pdf #Remarks: distribution of all the deletions in each sample are plotted in the graphic above. A proportion of cancer cells in the sample can be determined from these distributions ###################################################################################################### ####loading the dataset ##1 sample per column. First 2 columns: physical mapping info. load("BAF.Rdata") load("logR.Rdata") chromosome=BAF[,1] sample.size=dim(BAF)[2]-2 ####loading the reference chromosomes information mini=read.table("zReference_Chromosomes.txt")[,1:2] for (i in 1:sample.size) { AI_CNV=read.table(paste("zAllelicImbalance_",i,"_raw.txt",sep=''),header=T) donnees=BAF[,i+2] donnees_logR=logR[,i+2] load(paste("zAI_all_sample",i,".Rdata",sep='')) #BAF thresholds #distr homo pos_homo=which(chromosome==mini[i,2]) posAA=which(donnees[pos_homo]<0.25) posAA=pos_homo[posAA] a=hist(donnees[posAA],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.99) {break} compteur=compteur+1 } CA99=values[compteur] CA99=min(CA99,0.1) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.999) {break} compteur=compteur+1 } GA=values[compteur] GA=min(GA,0.1) posBB=which(donnees[pos_homo]>0.75) posBB=pos_homo[posBB] a=hist(donnees[posBB],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.01) {break} compteur=compteur+1 } CB99=values[compteur] CB99=max(CB99,0.9) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.001) {break} compteur=compteur+1 } GB=values[compteur] GB=max(GB,0.9) #distr hetero pos_het=which(chromosome==mini[i,1]) posAB=which((donnees[pos_het]>=0.25)&(donnees[pos_het]<=0.75)) posAB=pos_het[posAB] mean_het=mean(donnees[posAB],na.rm=T) #removal of homozygote SNPs. Nota: before flapping bec distribution is not symetric donnees_nohomo=donnees posAA=which(donneesCB99) donnees_nohomo[posBB]=NA #for regions not in AI, use of a more stringent cutoff pos=which(AI_all==0) cutoff=min(0.1,GA) posAA=which(donnees[pos]cutoff) posBB=pos[posBB] donnees_nohomo[posBB]=NA #flapping the BAF over mean_het pos=which(donnees_nohomo>mean_het) donnees_flapped=donnees_nohomo donnees_flapped[pos]=mean_het-(donnees_nohomo[pos]-mean_het)*mean_het/(1-mean_het) if (dim(AI_CNV)[1]>0) { #calibration of the BAF: flapped median of het is computed, and assigned to 0.5 pos_het=which(chromosome==mini[i,1]) reference_logR=donnees_logR[pos_het] mean_reference_logR=mean(reference_logR,na.rm=T) donnees_flapped_poshet=as.numeric(na.omit(donnees_flapped[pos_het])) median_het_flapped=median(donnees_flapped_poshet,na.rm=T) #plot(donnees_flapped[pos_het],pch='.') #testing which BAF hets are different from reference hets x1_all=AI_CNV[,1] x2_all=AI_CNV[,2] BAF_pvalues=vector(mode='numeric') for (j in 1:dim(AI_CNV)[1]) { x1=x1_all[j]:x2_all[j] # x1=(x1_all[j]-10):(x2_all[j]+10) #plot(donnees[x1],pch=20) donnees_temp=as.numeric(na.omit(donnees_flapped[x1])) if (length(donnees_temp)>=20) { a=t.test(donnees_temp,donnees_flapped_poshet) } else { a=wilcox.test(donnees_temp,donnees_flapped_poshet) } BAF_pvalues[j]=a[[3]] } #BAF_calibrated=pmin(AI_CNV[,3]*.5/median_het_flapped,.5) BAF_calibrated=pmin(AI_CNV[,3],.5) #CNV preliminary assignement x1_all=AI_CNV[,1] x2_all=AI_CNV[,2] logR_pvalues=vector(mode='numeric') CNV_prelim=rep(0,dim(AI_CNV)[1]) for (j in 1:dim(AI_CNV)[1]) { x1=x1_all[j]:x2_all[j] intensity=donnees_logR[x1] mean.intensity=mean(intensity,na.rm=T) if (length(intensity)>=20) { #centre sur 0 a=t.test(intensity,reference_logR-mean_reference_logR) } else { a=wilcox.test(intensity,reference_logR-mean_reference_logR) } logR_pvalues[j]=a[[3]] if ((logR_pvalues[j]<0.05)&(mean.intensity>0)) CNV_prelim[j]=1 if ((logR_pvalues[j]<0.05)&(mean.intensity<0)) CNV_prelim[j]=-1 } #cbind(AI_CNV,BAF_pvalues,logR_pvalues) ##percentage of cells percentage=vector(mode='numeric') #amplifications pos=which(CNV_prelim==1) if (length(pos)>0) percentage[pos]=1/tan(BAF_calibrated[pos]*pi/2)-1 #deletions pos=which(CNV_prelim==-1) if (length(pos)>0) percentage[pos]=1-tan(BAF_calibrated[pos]*pi/2) #deletion-duplications pos=which(CNV_prelim==0) if (length(pos)>0) percentage[pos]=(1-tan(BAF_calibrated[pos]*pi/2))/(1+tan(BAF_calibrated[pos]*pi/2)) percentage=round(percentage,2) AI_CNV_complete=cbind(AI_CNV,BAF_pvalues,logR_pvalues,percentage,CNV_prelim) #fitering AI_CNV based on the significance of BAF pos=which(AI_CNV_complete[,7]<0.05) if (length(pos)>0) { AI_CNV_filtered=matrix(as.matrix(AI_CNV_complete[pos,]),ncol=9) } else { AI_CNV_filtered=matrix(ncol=8,nrow=1) } dimnames(AI_CNV_filtered)=list(seq(1,length(pos)),dimnames(AI_CNV_complete)[[2]]) write.table(AI_CNV_filtered,paste("zAI_CNV_filtered_",i,".txt",sep=''),sep='\t') } } ##percentage of cells (distribution) pdf("zPropOfCellsWithAI_distribution.pdf",height=9,width=12) par(mfrow=c(3,6)) for (i in c(1:30,33:54)) { AI_CNV_filtered=read.table(paste("zAI_CNV_filtered_",i,".txt",sep=''),header=T) AI=rep(0,317503) pos=which(AI_CNV_filtered[,9]==(-1)) if (length(pos>0)) { x1_all=AI_CNV_filtered[pos,1] x2_all=AI_CNV_filtered[pos,2] percents_all=AI_CNV_filtered[pos,8] for (j in 1:length(x1_all)) { x1=x1_all[j] x2=x2_all[j] AI[x1:x2]=percents_all[j] } #filtering out the chromosome X AI[308331:317503]=0 #adding 1 for distribution AI[317503]=1 pos=which(AI>0) #par(mar=c(5,5,5,5)) #hist(AI[pos],xlim=c(0,1),breaks=10,ylab="Number of SNPs in AI",xlab="Proportion of cells in AI (c)",main="",cex.lab=2,cex.axis=1.5) hist(AI[pos],xlim=c(0,1),ylim=c(0,1000),breaks=10,main="",cex.lab=2,cex.axis=1.5) #for n=22: #hist(c(AI[pos],1),breaks=10,xlim=c(0,1),ylab="Number of SNPs in AI",xlab="Proportion of cells in AI (c)",main="",cex.lab=2,cex.axis=1.5) text(0.5,200,i) } } dev.off() ##############################End of module 3########################################################## ###################################################################################################### #####################Module 4: Copy Number Variations calls########################################### #Aim: Ascertain copy number variations. Especially exclude noise related variations of logR #Input: BAF.Rdata, logR.Rdata, zReference_Chromosomes.txt, zAI_CNV_filtered_i.txt #Output: zAI_CNV_complete_i.txt # #Remarks: ######################################################################################## load("BAF.Rdata") load("logR.Rdata") chromosome=BAF[,1] sample.size=dim(BAF)[2]-2 #loading the reference chromosomes information mini=read.table("zReference_Chromosomes.txt") for (i in 1:sample.size) { AI_CNV_filtered=read.table(paste("zAI_CNV_filtered_",i,".txt",sep=''),header=T) donnees_logR=logR[,i+2] longueur=dim(AI_CNV_filtered)[[1]] ##reading all the 2-band patterns (in the file AI_CNV_filtered) for (j in 1:longueur) { x1=AI_CNV_filtered[j,1] x2=AI_CNV_filtered[j,2] percentage=AI_CNV_filtered[j,8] CNV_prelim=AI_CNV_filtered[j,9] CNV=0 #default value #is the AI related to hemizygous deletion (somatic)? if (CNV_prelim==(-1)) { CNV=0 zone=x1:x2 logR_zone=donnees_logR[zone] if (median(logR_zone,na.rm=T)<(-.3*percentage)) { #testing the intensities around the fragment: need for at least .3 of difference b=0 c=0 if (x1>5) { b=median(donnees_logR[(x1-5):(x1-1)],na.rm=T) } if (x1<317503-5) { c=median(donnees_logR[(x2+1):(x2+5)],na.rm=T) } #if third criterion is met, call deletion if ((median(logR_zone,na.rm=T)-b<(-0.3*percentage))&(median(logR_zone,na.rm=T)-c<(-0.3*percentage))) CNV=-1 } if (CNV==0) percentage=percentage/(2-percentage) } #is the AI related to amplification of one allele? if (CNV_prelim==1) { CNV=0 zone=x1:x2 logR_zone=donnees_logR[zone] if (median(logR_zone,na.rm=T)>(0.2*percentage)) { #testing the intensities around the fragment: need for at least .3 of difference b=0 c=0 if (x1>5) { b=median(donnees_logR[(x1-5):(x1-1)],na.rm=T) } if (x1<317503-5) { c=median(donnees_logR[(x2+1):(x2+5)],na.rm=T) } #if third criterion is met, call deletion if ((median(logR_zone,na.rm=T)-b>(0.2*percentage))&(median(logR_zone,na.rm=T)-c>(0.2*percentage))) CNV=1 } if (CNV==0) percentage=percentage/(2+percentage) } AI_CNV_filtered[j,9]=CNV AI_CNV_filtered[j,8]=percentage } ##testing all the stretches of >5 homo SNPs #identification of homozygous SNPs donnees_BAF=BAF[,i+2] #reference distributions homo pos_homo=which(chromosome==mini[i,2]) posAA=which(donnees_BAF[pos_homo]<0.25) posAA=pos_homo[posAA] a=hist(donnees_BAF[posAA],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>=total*.999) {break} compteur=compteur+1 } CA=values[compteur] posBB=which(donnees_BAF[pos_homo]>0.75) posBB=pos_homo[posBB] a=hist(donnees_BAF[posBB],breaks=1000) effectives=a[[2]] values=a[[1]] total=sum(effectives) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives[compteur] if (cumulated>total*.001) {break} compteur=compteur+1 } CB=values[compteur] #reference intensity pos=which(chromosome==mini[i,3]) reference_logR=donnees_logR[pos] mean_reference_logR=mean(reference_logR,na.rm=T) # sd_reference_logR=sd(reference_logR,na.rm=T) #identification of stretches of consecutive homo pos_noNA=which(!is.na(donnees_BAF)) donnees_noNA=donnees_BAF[pos_noNA] longueur=length(donnees_noNA) posAA=which(donnees_noNACB) homo=rep(0,317503) homo[posAA]=1 homo[posBB]=1 fragments=matrix(nrow=longueur,ncol=2) fragments[,1]=1 fragments[,2]=0 pos=which(homo==1) somme=rep(0,longueur) somme[pos]=1 somme_ref=somme compteur=1 repeat { somme=somme[1:(length(somme)-1)]+somme_ref[(compteur+1):longueur] compteur=compteur+1 pos_stretch=which(somme==compteur) if (length(pos_stretch)==0) { break } for (l in 1:compteur) { fragments[pos_stretch+l-1,2]=compteur } pos2=which(fragments[,2]-fragments[,1]==1) fragments[pos2,1]=compteur fragments[,2]=0 } fragment=fragments[,1] fragment=c(fragment,0,6) taille.globale=0 # germline_deletions=matrix(nrow=0,ncol=9) repeat { CNV=0 x1=min(which(fragment>5)) if (x1==longueur+2) { break } fragment.taille=fragment[x1] x2=x1+fragment.taille-1 x1_withNA=pos_noNA[x1] x2_withNA=pos_noNA[x2] #deletion criterion 1 logR_zone=donnees_logR[x1_withNA:x2_withNA] median.logRzone=median(logR_zone,na.rm=T) if (median.logRzone<(-.3)) { if (fragment.taille>20) { a=t.test(logR_zone,reference_logR-mean_reference_logR) } else { a=wilcox.test(logR_zone,reference_logR-mean_reference_logR) } a=a[[3]] #deletion criterion 2 and 3 b=0 c=0 if (x1_withNA>5) { b=median(donnees_logR[(x1_withNA-5):(x1_withNA-1)],na.rm=T) } if (x1_withNA<317503-5) { c=median(donnees_logR[(x2_withNA+1):(x2_withNA+5)],na.rm=T) } if ((a<0.05)&(median.logRzone-b<(-0.3))&(median.logRzone-c<(-0.3))) { #completion of AI_CNV_filtered germline_deletions=rbind(germline_deletions,c(x1_withNA,x2_withNA,0,mean(logR_zone,na.rm=T),fragment.taille,0,a,1,-1)) plot(1,1,type='n') text(1,1,x2) } } fragment[x1:x2]=0 } ##looking for homozygous deletions (regions with very low logR) co_deletion=-1 pos_noNA=which(!is.na(donnees_logR)) donnees_logR_noNA=donnees_logR[pos_noNA] pos_deletions=which(donnees_logR_noNA1)) if (x1==length(na.omit(fragment))) { break } fragment.taille=fragment[x1] zone=compteur+(x1:(x1+fragment.taille-1)) temp=vector(mode="numeric") a=compteur+x1 x1_temp=pos_noNA[a] temp[1]=x1_temp a=compteur+x1+fragment.taille-1 x2_temp=pos_noNA[a] temp[2]=x2_temp logR_zone=donnees_logR[x1_temp:x2_temp] temp[4]=median(logR_zone,na.rm=T) a=wilcox.test(logR_zone,reference_logR-mean_reference_logR) temp[7]=a[[3]] temp[8]=1 temp[9]=-2 homo_deletion=rbind(homo_deletion,temp) fragment=fragment[(x1+fragment.taille):(longueur+2)] compteur=compteur+fragment.taille+x1-1 } #gathering the 3 types of CNV: 1- Somatic deletions+amplifications (AI_CNV_filtered), 2-Germline hemizygous deletions (germline_deletions), 3- Germline homozygous deletions (homo_deletions) dimnames(AI_CNV_filtered)[[2]][9]="CNV" length_temp=dim(AI_CNV_filtered)[[1]] noms_temp=vector(mode="character") for (j in 1:length_temp) { noms_temp[j]=paste("AmplOrSomaticDel_",j) } dimnames(AI_CNV_filtered)[[1]]=noms_temp dimnames(germline_deletions)[[2]]=dimnames(AI_CNV_filtered)[[2]] length_temp=dim(germline_deletions)[[1]] noms_temp=vector(mode="character") for (j in 1:length_temp) { noms_temp[j]=paste("GermDelHemi_",j) } dimnames(germline_deletions)[[1]]=noms_temp dimnames(homo_deletion)[[2]]=dimnames(AI_CNV_filtered)[[2]] length_temp=dim(homo_deletion)[[1]] noms_temp=vector(mode="character") for (j in 1:length_temp) { noms_temp[j]=paste("GermDelHomo_",j) } dimnames(homo_deletion)[[1]]=noms_temp AI_CNV_filtered_complete=rbind(AI_CNV_filtered,germline_deletions,homo_deletion) write.table(AI_CNV_filtered_complete,paste("zAI_CNV_complete_",i,".txt",sep=''),sep='\t') } ########################end of module 4########################### ###################################################################################################### #####################Module 5: Germline genotype Inferrence########################################### #Aim: Infers a reliable germline genotype from any tissue sample (either cancer or normal tissue) #Input: BAF AI_CNV #Output: genotype inferred # zLOD_AI_barelyhomo_sample #Remarks: ######################################################################################## #########################Module genotyping################ ####all barely homo fragments are analyzed (contained in the LOD score "zLOD_AI_barelyhomo_sample") #####Automatic indentification of AI (germline homozygosity study excluded) load("BAF.Rdata") mini=read.table("zReference_Chromosomes.txt") chromosome=BAF[,1] position_all=vector(mode='numeric') compteur=0 for (K in 1:23) { pos=which(chromosome==K) position_all=c(position_all,BAF[pos,2]) } minK=vector(mode='numeric') maxK=vector(mode='numeric') for (K in 1:23) { pos=which(chromosome==K) minK[K]=min(pos) maxK[K]=max(pos) } genotypes_predicted=matrix(nrow=317503,ncol=54) breaks_vector_AA=seq(0,25000,by=2)/100000 breaks_vector_BB=seq(75000,100000,by=2)/100000 maxi=length(breaks_vector_BB)-1 for (i in 1:54) { load(paste("zLOD_AI_barelyhomo_sample",i,".Rdata",sep='')) donnees=BAF[,i+2] #setting different cuts-off from the reference distributions pos_homo=which(chromosome==mini[i,2]) posAA=which(donnees[pos_homo]<0.25) posAA=pos_homo[posAA] a=hist(donnees[posAA],breaks=breaks_vector_AA) effectives_AA=a[[2]] values_AA=a[[1]] total_AA=sum(effectives_AA) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_AA[compteur] if (cumulated>=total_AA*.95) {break} compteur=compteur+1 } CAA.95=min((values_AA[compteur]+values_AA[compteur+1])/2,0.25) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_AA[compteur] if (cumulated>=total_AA*.999) {break} compteur=compteur+1 } CAA.999=min((values_AA[compteur]+values_AA[compteur+1])/2,0.25) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_AA[compteur] if (cumulated>=total_AA*.5) {break} compteur=compteur+1 } CAA.5=min((values_AA[compteur]+values_AA[compteur+1])/2,0.25) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_AA[compteur] if (cumulated>=total_AA*.75) {break} compteur=compteur+1 } CAA.75=min((values_AA[compteur]+values_AA[compteur+1])/2,0.25) #Region BB posBB=which(donnees[pos_homo]>0.75) posBB=pos_homo[posBB] a=hist(donnees[posBB],breaks=breaks_vector_BB) effectives_BB=a[[2]] values_BB=a[[1]] total_BB=sum(effectives_BB) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_BB[compteur] if (cumulated>total_BB*.001) {break} compteur=compteur+1 } CBB.999=max((values_BB[compteur]+values_BB[compteur+1])/2,0.75) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_BB[compteur] if (cumulated>total_BB*.25) {break} compteur=compteur+1 } CBB.75=max((values_BB[compteur]+values_BB[compteur+1])/2,0.75) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_BB[compteur] if (cumulated>total_BB*.5) {break} compteur=compteur+1 } CBB.5=max((values_BB[compteur]+values_BB[compteur+1])/2,0.75) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_BB[compteur] if (cumulated>total_BB*.05) {break} compteur=compteur+1 } CBB.95=max((values_BB[compteur]+values_BB[compteur+1])/2,0.75) #distr hetero pos_het=which(chromosome==mini[i,1]) posAB=which((donnees[pos_het]>=0.25)&(donnees[pos_het]<=0.75)) posAB=pos_het[posAB] mean_het=mean(donnees[posAB],na.rm=T) sd_het=sd(donnees[posAB],na.rm=T) breaks_vector_AB_lowres=seq(mean_het-0.125,mean_het+(trunc(.25/sd_het*3)+1)*sd_het/3-0.125,by=sd_het/3) breaks_vector_AB=seq(round((mean_het-0.125)*100000),round((mean_het+0.125)*100000),by=2)/100000 breaks_vector_AAB_lowres=seq(0,(trunc(.25/sd_het*3)+1)*sd_het/3,by=sd_het/3) breaks_vector_ABB_lowres=sort(1-breaks_vector_AAB_lowres) breaks_vector_AAB=seq(round(mean_het*100000)-25000,round(mean_het*100000),by=2)/100000 breaks_vector_ABB=seq(round(mean_het*100000),round(mean_het*100000)+25000,by=2)/100000 #determination of low resolution distribution of hets (used for alignement with observed data) donnees_tempAB=donnees[posAB] donnees_tempAB[which(donnees_tempABmean_het+(trunc(.25/sd_het*3)+1)*sd_het/3-0.125)]=mean_het+(trunc(.25/sd_het*3)+1)*sd_het/3-0.125 a=hist(donnees_tempAB,breaks=breaks_vector_AB_lowres) effectives_het_lowres=a[[2]] values_het_lowres=a[[1]] total_het_lowres=sum(effectives_het_lowres) pos_max_het_lowres=which(effectives_het_lowres==max(effectives_het_lowres)) #determination of hets precise stats (2-sided) donnees_tempAB=donnees[posAB] donnees_tempAB[which(donnees_tempABround((mean_het+0.125)*100000)/100000)]=round((mean_het-0.125)*100000)/100000 a=hist(donnees_tempAB,breaks=breaks_vector_AB) effectives_het=a[[2]] values_het=a[[1]] total_het=sum(effectives_het_lowres) #determination of AAB and ABB cuts-off based on reference distribution (high resolution) posAAB=which((donnees[pos_het]>=0.25)&(donnees[pos_het]<=mean_het)) posAAB=pos_het[posAAB] posABB=which((donnees[pos_het]>=mean_het)&(donnees[pos_het]<=0.75)) posABB=pos_het[posABB] donnees_temp=donnees[posAAB] donnees_temp[which(donnees_tempmax(breaks_vector_AAB))]=max(breaks_vector_AAB) a=hist(donnees_temp,breaks=breaks_vector_AAB) effectives_AAB=a[[2]] values_AAB=a[[1]] total_AAB=sum(effectives_AAB) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_AAB[compteur] if (cumulated>total_AAB*.05) {break} compteur=compteur+1 } CAAB.95=(values_AAB[compteur]+values_AAB[compteur+1])/2 threshold_AAB_fittingmethod=CAA.95+mean_het-CAAB.95 #ABB donnees_temp=donnees[posABB] donnees_temp[which(donnees_tempmax(breaks_vector_ABB))]=max(breaks_vector_ABB) a=hist(donnees_temp,breaks=breaks_vector_ABB) effectives_ABB=a[[2]] values_ABB=a[[1]] total_ABB=sum(effectives_ABB) cumulated=0 compteur=1 repeat { cumulated=cumulated+effectives_ABB[compteur] if (cumulated>=total_ABB*.95) {break} compteur=compteur+1 } CABB.95=(values_ABB[compteur]+values_ABB[compteur+1])/2 threshold_ABB_fittingmethod=CBB.95+mean_het-CABB.95 #SNPs easy to genotype pos_blue=which(blue<40) temp_posAA=which(donnees[pos_blue]CBB.999) temp_posBB=pos_blue[temp_posBB] #nota: for hets: no need to exclude blue regions, because blue or not, hets are hets (which is not the case of homo) temp_posAB=which((donnees>=CAA.999)&(donnees<=CBB.999)) genotypes_predicted[temp_posAA,i]=0 genotypes_predicted[temp_posAB,i]=0.5 genotypes_predicted[temp_posBB,i]=1 #SNPs hard to genotype repeat { pos_blue=which(blue>=40) if (length(pos_blue)==0) { break } else { posx1=pos_blue[1] x1=blue[posx1] posx2=max(which(blue==x1)) donnees_temp=donnees[posx1:posx2] pos1=which(donnees_temp<0.5) donnees_tempAA=donnees_temp[pos1] pos2=which(donnees_temp>0.5) donnees_tempBB=donnees_temp[pos2] # hist(donnees_tempAA,breaks=20) # hist(donnees_tempBB,breaks=20) ##dealing with the AA region... #cases without AA if (length(which(donnees_tempAACAA.95) donnees_tempAAB=donnees_tempAA[pos] a=hist(donnees_tempAAB,breaks=breaks_vector_AAB_lowres) effectives_obs_lowres=a[[2]] values_obs_lowres=a[[1]] total_obs_lowres=sum(effectives_obs_lowres) pos_max=which(effectives_obs_lowres==max(effectives_obs_lowres)) if (length(pos_max)>1) pos_max=round(median(pos_max)) peak_AAB=(values_obs_lowres[pos_max]+values_obs_lowres[pos_max+1])/2 if (peak_AAB>threshold_AAB_fittingmethod) { #method 1: case of separate distribution of hets and homo: 1 homo and 1 het distribution are fitted # ratio_het=sum(effectives_obs_lowres[(pos_max-1):(pos_max+1)])/sum(effectives_het_lowres[(pos_max_het_lowres-1):(pos_max_het_lowres+1)]) ratio_het=sum(effectives_obs_lowres)/sum(effectives_het_lowres) ratio_homo=length(which(donnees_tempAA=0)) temp=distrib_function_het_adjusted[pos:maxi] distrib_function_het_adjusted=c(temp,rep(total_het_adjusted,pos-1)) } if (values_het_adjusted[1]>0) { pos=min(which(values_AA>values_het_adjusted[1])) temp=distrib_function_het_adjusted[pos:maxi] distrib_function_het_adjusted=c(rep(0,pos-1),temp) } } else { ####method 2: 2 distributions cannot be distinguished: fitting a homo distribution is still #reliable on the first class of the distrib. For the hets, I take the 2nd half of distrib #(the half close to the hets) as a reference, assuming the distrib is symetric. ##cuts-off close to AA a=hist(donnees_tempAA,breaks=breaks_vector_AA,xlim=c(0,0.25)) effectives_obs=a[[2]] values_obs=a[[1]] ratio_homo=length(which(donnees_tempAApeak_AAB) effectives_het_adjusted=effectives_obs pos_mini=min(pos) effectives_het_adjusted[1:(pos_mini-1)]=0 if (pos_mini0) { pos=max(pos) coAA=(values_AA[pos]+values_AA[pos+1])/2 } else { coAA=CAA.5 } pos=which(False_het_rate<0.05) if (length(pos)>0) { pos=min(pos) coAAB=(values_AA[pos]+values_AA[pos+1])/2 } else { coAAB=CAA.999 } } ##dealing with the B region... #cases without B if (length(which(donnees_tempBB>CBB.5))==0) donnees_tempBB=c(donnees_tempBB,1) #finding the distribution peak of hets around B pos=which(donnees_tempBB1) pos_max=round(median(pos_max)) peak_ABB=(values_obs_lowres[pos_max]+values_obs_lowres[pos_max+1])/2 if (peak_ABBCBB.75))/length(which(donnees[posBB]>CBB.75)) ###ratios are then used with high resolution distributions effectives_het_adjusted=ratio_het*effectives_het effectives_homo_adjusted=ratio_homo*effectives_BB total_homo_adjusted=sum(effectives_homo_adjusted) total_het_adjusted=sum(effectives_het_adjusted) #distribution functions distrib_function_het_adjusted=rep(NA,maxi) distrib_function_homo_adjusted=rep(NA,maxi) #lowering resolution (for algo efficiency) for (j in seq(1,maxi,by=10)) { distrib_function_het_adjusted[j:(j+9)]=sum(effectives_het_adjusted[j:maxi]) distrib_function_homo_adjusted[j:(j+9)]=sum(effectives_homo_adjusted[1:j]) } #alignement of values_het_adjusted: centering the het distribution on the peak_AAB values_het_adjusted=values_het-round(mean_het,5)+peak_ABB if (values_het_adjusted[maxi]>1) { pos=max(which(values_het_adjusted<=1)) temp=distrib_function_het_adjusted[1:pos] distrib_function_het_adjusted=c(rep(total_het_adjusted,maxi-pos),temp) } if (values_het_adjusted[maxi]<1) { pos=min(which(values_BB>values_het_adjusted[maxi])) temp=distrib_function_het_adjusted[1:pos] distrib_function_het_adjusted=c(temp,rep(0,maxi-pos)) } } else { ####method 2 for B region: a=hist(donnees_tempBB,breaks=breaks_vector_BB,xlim=c(0.75,1)) effectives_obs=a[[2]] values_obs=a[[1]] ratio_homo=length(which(donnees_tempBB>CBB.5))/length(which(donnees[posBB]>CBB.5)) effectives_homo_adjusted=effectives_BB*ratio_homo pos=which(values_obsmaxi/2) { pos2=pos_maxi*2-pos+1 pos2=pos2[(2*pos_maxi-maxi+1):pos_maxi] pos3=pos[(2*pos_maxi-maxi+1):pos_maxi] effectives_het_adjusted[pos2]=effectives_het_adjusted[pos3] } else { pos2=pos_maxi*2-pos+1 effectives_het_adjusted[pos2]=effectives_het_adjusted[pos] effectives_het_adjusted=c(effectives_het_adjusted,rep(0,maxi-2*length(pos))) } # plot(effectives_het_adjusted,type='h') total_het_adjusted=sum(effectives_het_adjusted) total_homo_adjusted=sum(effectives_homo_adjusted) #general counter (there because not visible otherwise!) plot(1,1,type='n') text(1,1,posx2,cex=3) #distribution functions distrib_function_het_adjusted=rep(NA,maxi) distrib_function_homo_adjusted=rep(NA,maxi) #limiting the window screened to 1/3 for (j in seq(maxi/3,maxi)) { distrib_function_het_adjusted[j]=sum(effectives_het_adjusted[j:maxi]) distrib_function_homo_adjusted[j]=sum(effectives_homo_adjusted[1:j]) } } #False homo and hetero rates N_false_homo=distrib_function_het_adjusted N_false_het=distrib_function_homo_adjusted False_homo_rate=N_false_homo/total_homo_adjusted False_het_rate=N_false_het/total_het_adjusted # plot(False_homo_rate,type='h',ylim=c(0,1)) # par(new=T) # plot(False_het_rate,type='h',ylim=c(0,1),col='red') #finding the most discriminant points a=abs(False_het_rate-False_homo_rate) # plot(a,type='h') pos_discr=which(a==min(a,na.rm=T)) pos_discr=mean(pos_discr) if ((False_homo_rate[pos_discr]<0.05)&(False_het_rate[pos_discr]<.05)) { coBB=(values_BB[pos_discr]+values_BB[pos_discr+1])/2 coABB=coBB } else { pos=which(False_homo_rate<.05) #pos=which(False_homo_rate<.01) if (length(pos)>0) { pos=min(pos) coBB=(values_BB[pos]+values_BB[pos+1])/2 } else { coBB=CBB.5 } pos=which(False_het_rate<0.05) # pos=which(False_het_rate<0.01) if (length(pos)>0) { pos=max(pos) coABB=(values_BB[pos]+values_BB[pos+1])/2 } else { coABB=CBB.999 } } ###genotype calls temp_posAA=which(donnees_tempcoAAB)&(donnees_tempcoBB)+posx1-1 genotypes_predicted[temp_posAA,i]=0 genotypes_predicted[temp_posAB,i]=0.5 genotypes_predicted[temp_posBB,i]=1 temp_posAAB=which((donnees_temp>coAA)&(donnees_tempcoABB)&(donnees_temp