#!/usr/bin/perl

# This script generates collocate lists or tables from a CWB concordance file, the
# output is written as a csv file with tokens enclosed in quotation marks. The
# hit in the concordance must be enclosed by angled brackets: <HIT>; this can be changed
# in the first if-statement, but note that there is a good reason for using angled
# brackets to enclose hits for this purpose. By default, the script produces case 
# sensitive collocation tables, this can be changed by using flags.
# Usage:
# collocates.pl --list --case file.txt
# collocates.pl -l -c file.txt
# collocates.pl -lc file.txt
# --list, -l produces an alphabetically ordered list with columns from L4 to R4 showing
# the frequency of the word at those positions; if left unspecified, the output will have
# form of eight sets of two columns showing, respectively, the token and the frequency
# at each position from L4 to R4.
# --case, -c produces a case insensitive list or table; if left unspecified, the output 
# is case sensitive.
# (c) 2019 Anatol Stefanowitsch, GNU General Public License 3.0.

# The script can be used with strict and warnings
#use diagnostics;
#use strict;
#use warnings;
#use 5.010;
use Getopt::Long qw(GetOptions);
use List::MoreUtils qw[uniq];
use List::Util qw[min max];
use Data::Dumper;
Getopt::Long::Configure qw(gnu_getopt);

# Declare all variables, hashes and arrays
my $list;
my $case;
my %countL4;
my %countL3;
my %countL2;
my %countL1;
my %countR1;
my %countR2;
my %countR3;
my %countR4;
my @keysL4;
my @keysL3;
my @keysL2;
my @keysL1;
my @keysR1;
my @keysR2;
my @keysR3;
my @keysR4;
my $L4;
my $L3;
my $L2;
my $L1;
my $R1;
my $R2;
my $R3;
my $R4;
my $freqL4;
my $freqL3;
my $freqL2;
my $freqL1;
my $freqR1;
my $freqR2;
my $freqR3;
my $freqR4;
my $freqR;
my $freqL;
my $freqT;
my $tokenL4;
my $tokenL3;
my $tokenL2;
my $tokenL1;
my $tokenR1;
my $tokenR2;
my $tokenR3;
my $tokenR4;

# read the flags (if any)

GetOptions('list|l' => \$list, 'case|c' => \$case);

# read the input concordance, case sensitive or insensitive, depending on how
# the flag is set…

while(<>){

	s/\"/\"\"/g;

	if($_ =~ m/^.* (\S+) (\S+) (\S+) (\S+) \<\S+\> (\S+) (\S+) (\S+) (\S+)($| .*$)/){
	
		if ($case) {
		
			$L4 = lc($1);
			$L3 = lc($2);
			$L2 = lc($3);
			$L1 = lc($4);
			$R1 = lc($5);
			$R2 = lc($6);
			$R3 = lc($7);
			$R4 = lc($8);
		
		}else{

			$L4 = $1;
			$L3 = $2;
			$L2 = $3;
			$L1 = $4;
			$R1 = $5;
			$R2 = $6;
			$R3 = $7;
			$R4 = $8;
				
		}

#...and create a frequency list at each position		
		
		$countL4{$L4} = $countL4{$L4} ? $countL4{$L4}+1 : 1;
		$countL3{$L3} = $countL3{$L3} ? $countL3{$L3}+1 : 1;
		$countL2{$L2} = $countL2{$L2} ? $countL2{$L2}+1 : 1;
		$countL1{$L1} = $countL1{$L1} ? $countL1{$L1}+1 : 1;
		$countR1{$R1} = $countR1{$R1} ? $countR1{$R1}+1 : 1;
		$countR2{$R2} = $countR2{$R2} ? $countR2{$R2}+1 : 1;
		$countR3{$R3} = $countR3{$R3} ? $countR3{$R3}+1 : 1;
		$countR4{$R4} = $countR4{$R4} ? $countR4{$R4}+1 : 1;
				
	}
	
# could add an "else" condition here to count lines that were not matched	

}

# create collocate lists for each position, ordered by frequency

	@keysL4 = keys %countL4;
	@keysL3 = keys %countL3;
	@keysL2 = keys %countL2;
	@keysL1 = keys %countL1;
	@keysR1 = keys %countR1;
	@keysR2 = keys %countR2;
	@keysR3 = keys %countR3;
	@keysR4 = keys %countR4;

# if the option --list is used:

if ($list) {

# create a master list of all collocates, and sort it alphabetically

	my @keysALL = uniq(@keysL4,@keysL3,@keysL2,@keysL1,@keysR1,@keysR2,@keysR3,@keysR4);

	@keysALL = sort @keysALL;

	print ("\"Word\",\"Total\",\"Left\",\"Right\",\"L4\",\"L3\",\"L2\",\"L1\",\"R1\",\"R2\",\"R3\",\"R4\"\r\n");

# for each word on the master list, get the frequency information and put it into a
# a variable for printing (assign 0 if there is no frequency information)

	foreach my $i (@keysALL) {
	
		if ($countL4{$i}) {
			$freqL4 = $countL4{$i};
		}else{
			$freqL4 = 0;
		}

		if ($countL3{$i}) {
			$freqL3 = $countL3{$i};
		}else{
			$freqL3 = 0;
		}

		if ($countL2{$i}) {
			$freqL2 = $countL2{$i};
		}else{
			$freqL2 = 0;
		}

		if ($countL1{$i}) {
			$freqL1 = $countL1{$i};
		}else{
			$freqL1 = 0;
		}

		if ($countR1{$i}) {
			$freqR1 = $countR1{$i};
		}else{
			$freqR1 = 0;
		}

		if ($countR2{$i}) {
			$freqR2 = $countR2{$i};
		}else{
			$freqR2 = 0;
		}

		if ($countR3{$i}) {
			$freqR3 = $countR3{$i};
		}else{
			$freqR3 = 0;
		}

		if ($countR4{$i}) {
			$freqR4 = $countR4{$i};
		}else{
			$freqR4 = 0;
		}

# calculate left and right subtotals and totals for the current word

		$freqR = $freqR1 + $freqR2 + $freqR3 + $freqR4;
		$freqL = $freqL1 + $freqL2 + $freqL3 + $freqL4;
		$freqT = $freqR + $freqL;

		print ("\"".$i."\",".$freqT.",".$freqL.",".$freqR.",".$freqL4.",".$freqL3.",".$freqL2.",".$freqL1.",".$freqR1.",".$freqR2.",".$freqR3.",".$freqR4."\r\n");

	}

# if the list option is not set:
	
}else{

# create lists at each position, sorted by frequency in descending order

	@keysL4 = sort { $countL4{$b} <=> $countL4{$a} } @keysL4;
	@keysL3 = sort { $countL3{$b} <=> $countL3{$a} } @keysL3;
	@keysL2 = sort { $countL2{$b} <=> $countL2{$a} } @keysL2;
	@keysL1 = sort { $countL1{$b} <=> $countL1{$a} } @keysL1;
	@keysR1 = sort { $countR1{$b} <=> $countR1{$a} } @keysR1;
	@keysR2 = sort { $countR2{$b} <=> $countR2{$a} } @keysR2;
	@keysR3 = sort { $countR3{$b} <=> $countR3{$a} } @keysR3;
	@keysR4 = sort { $countR4{$b} <=> $countR4{$a} } @keysR4;

# set a counter and determine, which collocate list is the longest

	my $i = 0;
	my $longest = max($#keysL4, $#keysL3, $#keysL2, $#keysL1, $#keysR1, $#keysR2, $#keysR3, $#keysR4) + 1;

	print ("\"L4\",\"FrqL4\",\"L3\",\"FrqL3\",\"L2\",\"FrqL2\",\"L1\",\"FrqL1\",\"R1\",\"FrqR1\",\"R2\",\"FrqR2\",\"R3\",\"FrqR3\",\"R4\",\"FrqR4\"\r\n");

	while ($i <= $longest) {

# if there is a word at this position, set the print variable $token… to this
# frequency, if not, set it to an empty string.

		if (exists($keysL4[$i])) {
			$tokenL4 = $keysL4[$i];
		}else{
			$tokenL4 = "";
		}

		if (exists($keysL3[$i])) {
			$tokenL3 = $keysL3[$i];
		}else{
			$tokenL3 = "";
		}

		if (exists($keysL2[$i])) {
			$tokenL2 = $keysL2[$i];
		}else{
			$tokenL2 = "";
		}

		if (exists($keysL1[$i])) {
			$tokenL1 = $keysL1[$i];
		}else{
			$tokenL1 = "";
		}

		if (exists($keysR1[$i])) {
			$tokenR1 = $keysR1[$i];
		}else{
			$tokenR1 = "";
		}

		if (exists($keysR2[$i])) {
			$tokenR2 = $keysR2[$i];
		}else{
			$tokenR2 = "";
		}

		if (exists($keysR3[$i])) {
			$tokenR3 = $keysR3[$i];
		}else{
			$tokenR3 = "";
		}

		if (exists($keysR4[$i])) {
			$tokenR4 = $keysR4[$i];
		}else{
			$tokenR4 = "";
		}


# if there is a word frequency at this position, set the print variable $freq… to this
# frequency, if not, set it to an empty string.

		if (exists($keysL4[$i])) {
			$freqL4 = $countL4{$keysL4[$i]};
		}else{
			$freqL4 = "";
		}

		if (exists($keysL3[$i])) {
			$freqL3 = $countL3{$keysL3[$i]};
		}else{
			$freqL3 = "";
		}

		if (exists($keysL2[$i])) {
			$freqL2 = $countL2{$keysL2[$i]};
		}else{
			$freqL2 = "";
		}

		if (exists($keysL1[$i])) {
			$freqL1 = $countL1{$keysL1[$i]};
		}else{
			$freqL1 = "";
		}

		if (exists($keysR1[$i])) {
			$freqR1 = $countR1{$keysR1[$i]};
		}else{
			$freqR1 = "";
		}

		if (exists($keysR2[$i])) {
			$freqR2 = $countR2{$keysR2[$i]};
		}else{
			$freqR2 = "";
		}

		if (exists($keysR3[$i])) {
			$freqR3 = $countR3{$keysR3[$i]};
		}else{
			$freqR3 = "";
		}

		if (exists($keysR4[$i])) {
			$freqR4 = $countR4{$keysR4[$i]};
		}else{
			$freqR4 = "";
		}


# now print the output and increase the counter $i

		print ("\"".$tokenL4."\",".$freqL4.","."\"".$tokenL3."\",".$freqL3.","."\"".$tokenL2."\",".$freqL2.","."\"".$tokenL1."\",".$freqL1.","."\"".$tokenR1."\",".$freqR1.","."\"".$tokenR2."\",".$freqR2.","."\"".$tokenR3."\",".$freqR3.","."\"".$tokenR4."\",".$freqR4."\r\n");

		$i = $i + 1;

	}

}