#!/usr/bin/perl

# ngrams.pl - count and display the most frequent ngrams in a text

# Eric Lease Morgan <eric_morgan@infomotions.com>
# August    28, 2010 - first cut; for a blog posting
# August    29, 2010 - tweaked to accept command-line input
# September 12, 2010 - tweaked for use by Lingua::EN::Ngram


# require
use strict;
use warnings;
use Lingua::EN::Ngram;
use Lingua::StopWords qw( getStopWords );

# sanity check; both arguments are required, and the ngram size must be a
# positive integer because it is compared numerically and passed to ngram()
my $file = $ARGV[ 0 ];
my $size = $ARGV[ 1 ];
if ( ! defined $file or $file eq '' or ! defined $size or $size !~ /\A[1-9][0-9]*\z/ ) {

	# a usage error is a failure; report it on STDERR and exit non-zero
	print STDERR "Usage: $0 <file> <integer>\n";
	exit 1;
	
}

# fail early, with a clear message, when the input cannot be read
die "$0: cannot read '$file'\n" unless -r $file;

# initialize and count ngrams
# $stopwords is used below as a reference to a hash keyed on stop words, and
# $ngrams as a reference to a hash mapping each phrase to its frequency;
# getStopWords is called as a plain function rather than with the legacy & sigil
my $stopwords = getStopWords( 'en' );
my $ngram     = Lingua::EN::Ngram->new( file => $file );
my $ngrams    = $ngram->ngram( $size );

# process all the ngrams, most frequent first; phrases with equal counts are
# ordered alphabetically so the output is the same from run to run
PHRASE: foreach my $phrase ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } || $a cmp $b } keys %$ngrams ) {
	
	# skip the phrase as soon as one of its tokens is unwanted
	foreach my $token ( split / /, $phrase ) {
	
		# stop words are only filtered from unigrams and bigrams
		next PHRASE if ( $size < 3 and $$stopwords{ $token } );
	
		# tokens containing punctuation are never wanted
		next PHRASE if ( $token =~ /[,.?!:;()\-]/ );
		
	}
	
	# don't want single frequency phrases; because of the sort order every
	# remaining phrase also occurs only once, so stop here
	last PHRASE if ( $size > 1 and $$ngrams{ $phrase } == 1 );

	# echo
	print $$ngrams{ $phrase }, "\t$phrase\n";
	
}

# done
exit;
