#!/gsc/bin/perl

use strict;
use warnings;
use Getopt::Std;

my %opts = (i=>150, m=>1000000,l=>35, q=>35, s=>7, r=>2, b=>100);
getopts('s:m:i:l:q:r:b:', \%opts);
die("
Usage:   maq.pl sv <in.mapview>\n
Options: -i INT    maximum insert size [$opts{i}]
         -m INT    maximum SV size [$opts{m}]
         -l INT    average read length [$opts{l}]
         -q INT    minimum alternative mapping quality [$opts{q}]
         -s INT    minimum length of a region [$opts{s}]
         -b INT    buffer size for building connection [$opts{b}]
         -r INT    minimum number of read pairs required to establish a connection\n
") unless (@ARGV);

my $d = $opts{i} - $opts{l};
my ($begins, $beginc, $lasts, $lastc) = ('', -1, '', -1);
my (%regs,   # tabulate regions flanking alignment gaps
    %read,   # assign read pairs to one or more region
    %reg_name    # chromosome, start, end, and length classes of the regions
   );
my @reg_seq;

#warn("-- read the mapview output\n");
my $idx_buff=0;
my $final_buff=0;
my $reg_idx=0;
while (<>) {
  my @t = split;
  next if ($t[8] <= $opts{q});  #alternative mapping quality
  next if ($t[5]==0 || $t[5] == 18 || $t[5] == 64 || $t[5] == 130);
  next if ($t[5]!=32 && abs($t[4])>$opts{m});  #skip read pairs mapped too distantly on the same chromosome


  #next if($t[5]==18 || $t[5]==130);
  #next if($t[8]<$opts{q} && $t[5]!=64 && $t[5]!=192);

  my $do_break = ($t[1] ne $lasts || $t[2] - $lastc > $d)? 1 : 0;
  if ($do_break) {  # breakpoint in the assembly
    if ($lastc - $beginc > $opts{s}) { # skip short/unreliable flanking supporting regions,
      #register reliable region and supporting reads across gaps

      my $k=$reg_idx++;
      my $flag = ($lastc - $beginc < $opts{i})? '*' : '.';
      $reg_name{$k}="$begins\t$beginc\t$lastc\t$flag";

      my @p;
      foreach (@reg_seq) {
	push(@p, $_);
	my @s = split;
	push(@{$read{$s[0]}}, $k);
      }
      $regs{$k}=\@p;
      $idx_buff++;
      if($idx_buff>$opts{b}){
	&buildConnection();
	$idx_buff=0;
      }
    }
    else{
      foreach (@reg_seq) {
	my @s = split;
	delete $read{$s[0]};
      }
    }
    ($begins, $beginc) = @t[1..2];
    @reg_seq = ();
  }
  $t[0] =~ s/\/[12]$//;
  push(@reg_seq, join(" ", @t[0..3,5,8,13]));
  ($lasts, $lastc) = @t[1..2];
}
$final_buff=1;
&buildConnection();


sub buildConnection{
  # build connections
  # find paired regions that are supported by paired reads
  #warn("-- link regions\n");
  my %link;
  foreach my $x (keys %read) {
    my $p = $read{$x};
    next if (@$p != 2);  #skip singleton read (non read pairs)
    if (defined $link{$p->[0]}{$p->[1]}) {
      ++$link{$p->[0]}{$p->[1]};
    } else {
      $link{$p->[0]}{$p->[1]} = 1;
    }
  }

  my %clink=%link;
  # segregate graph, find nodes that have connections
  my @free_regions;
  foreach my $s0(keys %clink){
    next unless(defined $clink{$s0});

    #construct a subgraph
    my %subgraph=($s0=>1);
    my @tails=($s0);
    while(@tails){
      my @newtails;
      foreach my $tail(@tails){
	next unless(defined $clink{$tail});
	my @s1s=keys %{$clink{$tail}};
	foreach my $s1(@s1s){
	  my $nlinks=$clink{$tail}{$s1};
	  delete $clink{$tail}{$s1};  # use a link only once
	  next if($nlinks<$opts{r});  # require sufficient number of pairs
	  next if(defined $subgraph{$s1});  # a node only appear once in a subgraph
	  $subgraph{$s1}=1;  #add node to the subgraph
	  push @newtails,$s1;
	}
	delete $clink{$tail};
      }
      @tails=@newtails;
    }

    #Check for PE completeness

    my @hangover_Pair;
    my $nreads=0;
    my %read_pair;
    my @nodes=keys %subgraph;
    foreach my $node(@nodes){
      foreach my $y (@{$regs{$node}}) {
	my @t = split(" ", $y);
	next unless(defined $read{$t[0]});
	if(!defined $read_pair{$t[0]}){
	  $read_pair{$t[0]}=1;
	}
	else{
	  delete $read_pair{$t[0]};
	}
	$nreads++;
      }
    }

    if(!$final_buff){
      @hangover_Pair=keys %read_pair;
    }

    if(@hangover_Pair<$opts{r} ){  # subgraph segregation successful
      my $flag='AMB';
      if($#nodes+1==2){
	my $x = $_;
	my @count;
	$count[$_] = 0 for (0..3);
	#count 0, 1 the + and - alignment in region 1 of the pair
	#count 2, 3 the + and - alignment in region 2 of the pair
	foreach my $y (@{$regs{$nodes[0]}}) {
	  my @t = split(" ", $y);
	  ++$count[($t[3] eq '+')? 0 : 1];
	}
	foreach my $y (@{$regs{$nodes[1]}}) {
	  my @t = split(" ", $y);
	  ++$count[($t[3] eq '+')? 2 : 3];
	}
	# infer flag
	$flag = ($nodes[0] == $nodes[1])? 'LOP' : '';  #paired region is the same region, e.g. inversion
	if (!$flag) {
	  my @t1 = split(/\s+/, $reg_name{$nodes[0]});
	  my @t2 = split(/\s+/, $reg_name{$nodes[1]});
	  if ($t1[0] ne $t2[0]) {    #inter-chromosomal rearrangement
	    $flag = 'DIF';
	  } elsif (($count[1] == 0 && 2*$count[2] < $count[3])
		   || ($count[2] == 0 && 2*$count[1] < $count[0])) {
	    #majority evidence supporting deletion
	    $flag = 'DEL';
	  } else {
	    $flag = 'AMB';
	  }
	}
      }

      #print out result
      print "$nreads\t$flag\t";

      #record list of nodes to be freed
      foreach my $node(keys %subgraph){
	print "\t$reg_name{$node}";
	push @free_regions,$node;
      }
      print "\n";

    }
  }

  #release memory
  foreach my $node(@free_regions){
    #remove reads in the regions
    foreach my $y (@{$regs{$node}}) {
      my @t = split(" ", $y);
      my $readname=$t[0];
      delete $read{$readname};
    }
    #remove regions
    delete $regs{$node};
    delete $reg_name{$node};
  }
}
