#!usr/bin/perl #convert all scores for svm input, run libsvm, output result alignments and/or result human reference sequence #input: -s tab separated list scores # -d directory where libsvm (from svm-predict) is # -a where the full clustalw alignments are # -m svm model use warnings; use Tie::IxHash; my $i=0; my $svm_dir; my $model; my $score_file; my $align_dir; my $argc=@ARGV; if($argc==0){ print "-h for input instructions\n"; } foreach(@ARGV){ if($ARGV[$i] eq '-h'){ print "-s tab scores of RNAalifold, LocARNA (mlocarna), Foldalign, Cove (coveb), GC, APSI\n", " each line starts with alignment_name.start.end and scores separated by tab or space\n", "-d directory where svm-predict (libsvm2.9) is\n", "-a directory where the input alignments are, clustalw format, filename: alignment_name.extension and alignment_name contains no . \n", "-m svm model\n"; } if($ARGV[$i] eq '-h') {exit; } if($ARGV[$i] eq '-s'){ $score_file=$ARGV[$i+1]; } if($ARGV[$i] eq '-d'){ $svm_dir=$ARGV[$i+1]; } if($ARGV[$i] eq '-a'){ $align_dir=$ARGV[$i+1]; } if($ARGV[$i] eq '-m'){ $model=$ARGV[$i+1]; } $i++; } #check scores to delete lines that do not fit into the ranges. output into svm format for scaling. { my @score; my @seqid; open(svm_out, ">>svm_result") or die "can't create svm_result"; open(Score, $score_file) or die "can't open score file"; open(svm_input, ">>temp") or die "can't create svm input file"; while(){ chomp; @score=split(/[\t ]/, $_); my $i=0; my $print=1; push(@seqid, $score[0]); foreach(@score){ $i++; if($i ==1){ if($score[$i]>=-1 && $score[$i]<=0){ $score[$i]="1".":".$score[$i]; } else{ $print = 0; } } if($i == 2){ if($score[$i]>=-1.5 && $score[$i]<=0){ $score[$i]="2".":".$score[2]; } else{ $print=0; } } if($i == 3){ if($score[$i]>=-20 && $score[$i]<=50){ $score[$i]="3".":".$score[$i]; } else{ $print=0; } } if($i == 4){ if($score[$i]>=-0.1 && $score[$i]<=1.5){ $score[$i]="4".":".$score[$i]; } else{ $print=0; } } if($i == 5){ $score[$i]="5".":".$score[$i]; } if($i == 6){ $score[$i]="6".":".$score[$i]; } } if($print == 1){ print svm_input "-1 "; shift(@score); print svm_input join " ", @score; print svm_input "\n"; } } print svm_input "-1 1:-1 2:-1.5 3:-20 4:-0.1 5:0 6:0\n-1 1:0 2:0 3:50 4:1.5 5:1 6:1\n"; system("$svm_dir/svm-scale temp > temp.scaled"); system("$svm_dir/svm-predict -b 1 temp.scaled $model temp.out"); close(svm_input); open(t_out, "temp.out") or die "can't open temp.out"; my $j=-1; while(){ chomp; if(! /labels/){ $j++; if(exists $seqid[$j]){ print svm_out $seqid[$j], "\t", $_, "\n"; } } } close(t_out); close(svm_out); unlink("temp"); unlink("temp.scaled"); unlink("temp.out"); } system("cp svm_result svm_prediction.txt"); #get positions and combine, windows must be sorted. { open(svm_out, ">position") or die "can't create position file"; while(){ chomp; my @item=split(/[\t ]/, $_); if($item[1] eq '1'){ $item[0] =~ s/\./\t/g; print P $item[0], "\n"; } } close(P); unlink("svm_result"); system("sort -n -k2 position > position.txt"); unlink("position"); open(P_to_sort, ">position_sorted") or die "can't create position_sorted"; while(){ chomp; @line=split(/\t/, $_); if($line[0] ne $temp){ if($temp ne '0'){ print P_sorted $temp, "\t", $start, "\t", $end, "\n"; } $temp=$line[0]; $start=$line[1]; $end=$line[2]; } else{ if($line[1]<$end || $line[1] eq $end){ $end=$line[2]; } else { print P_sorted $temp, "\t", $start, "\t", $end, "\n"; $start=$line[1]; $end=$line[2]; } } } print P_sorted $temp, "\t", $start, "\t", $end, "\n"; unlink("position.txt"); } #get result alignments #position_sorted is the start and stop positions of each alignment to be selected #align_dir is the directory where all original alignment files are #the position file format is: alignment_name"\t"start"\t"end"\n" { system("mkdir result_alignment"); my %aln; #this is to store alignment name and alignment file name my $alignment_name; my $i; my @pos; #this is the array for each line of the position file my %seq; my ($start, $end); #NOTE: the alignment name is the filename minus file extension. #build up a hash with the alignment name as the key and file name as value opendir(DIR, $align_dir); while(my $file = readdir DIR){ next if($file=~/^\./); $file=~/^([^\.]+)/; #file name contains no "." $alignment_name=$1; $aln{$alignment_name}=$file; } open(F_pos, "position_sorted") or die "can't open position_sorted"; while(){ chomp; @pos=split(/\t/, $_); if(exists $aln{$pos[0]}){ $start=$pos[1]; $end=$pos[2]; open(Aln, "$align_dir/$aln{$pos[0]}") or die "can't open $align_dir/$aln{$pos[0]}"; tie (%seq, Tie::IxHash); while(){ chomp; if(/^\S+\s+\S+$/){ $_=~/^(\S+)\s+(\S+)$/; if(!exists $seq{$1}){ $seq{$1}=$2; } else{ $seq{$1}=$seq{$1}.$2; } } } open(f_out, ">>$pos[0].$start.$end.extended") or die "can't open new file"; print f_out "CLUSTAL W(1.5) multiple sequence alignment\n\n"; while((my $key, my $value)=each %seq){ print f_out $key, "\t"; for($i=0; $i