#!/usr/bin/perl

# intersections.pl - extract and count the common ngrams from two texts

# Eric Lease Morgan <eric_morgan@infomotions.com>
# September 11, 2010 - first investigations
# September 12, 2010 - made it more general


# configure
use constant TEXTONE   => './copperfield.txt';
use constant TEXTTWO   => './twist.txt';
use constant TEXTTHREE => './christmas.txt';
use constant LENGTH    => 25;

# require
use lib '../lib';
use strict;
use Lingua::EN::Ngram;
use Lingua::StopWords qw( getStopWords );

# get input and sanity check
my $length = $ARGV[ 0 ];
if ( ! $length ) {

	print "Usage: $0 <integer>\n";
	exit;
	
}

# initialize
my $stopwords  = &getStopWords( 'en' );

# build corpus
my $textone   = Lingua::EN::Ngram->new( file => TEXTONE );
my $texttwo   = Lingua::EN::Ngram->new( file => TEXTTWO );
my $textthree = Lingua::EN::Ngram->new( file => TEXTTHREE );
my $corpus    = Lingua::EN::Ngram->new;

# calculate intersections
my $intersections = $corpus->intersection( corpus => [ ( $textone, $texttwo, $textthree ) ], length => $length );

# process each intersection
print 'Top ', LENGTH, " $length-gram phrases common to all ", TEXTONE, ', ', TEXTTWO, ', and ', TEXTTHREE, ":\n";
my $index = 0;
foreach my $_ ( sort { $$intersections{ $b } <=> $$intersections{ $a }} keys %$intersections ) {

	# (re-)initialize
	my $found = 0;
	
	# skip punctuation
	next if ( $_ =~ /[,.?!:;()\-]/ );
	next if ( $_ =~ /^'/ or $_ =~ /' / );
	
	# process bigrams
	if ( $length == 2 ) {
			
		# process each token of the bigram
		foreach my $token ( split / /, $_ ) {
		
			# flag occurance of stop words
			if ( $$stopwords{ $token }) { $found = 1 }
			
		}

	}

	# loop if found an unwanted token
	next if ( $found );
	
	# increment
	$index++;
	last if ( $index > LENGTH );
	
	# print summary
	print $$intersections{ $_ }, "\t$_\n";
	
}

# done
exit;


