#!/usr/bin/perl -w
#$Id: make_collection,v 1.4 2005/04/13 00:05:44 joshr Exp $
# script to create random collections for swish-e from a file like /usr/dict/words (one word per line)
use strict;
use warnings;
use Getopt::Long;
use GetDictionaryWords;
# ---------------------------------------------------------------------------
# Defaults. Most of these can be overridden on the command line; see the
# GetOptions call below for the matching option names.
# ---------------------------------------------------------------------------
# Program name, used as the prefix of every diagnostic message.
my $prog = "make_collection";
# Dictionary file that supplies the words. One word per line.
my $dict='data/C020-words-txt/words-linux-fc1.txt';
# Lower and upper bound for the number of words placed in each generated file.
my $min_words_per_file=1;
my $max_words_per_file=1;
my $num_files=0; # 0 means one file for each word in the dictionary
my $num_words; # should be scalar(@words); not assigned in the code visible here
my $base_dir = ""; # empty base_dir means: act as a swish-e "-S prog" external program and print documents to stdout
# In randommode, words are chosen randomly; otherwise they are taken
# sequentially from the dictionary.
my $randommode = 1;
my $englishify = 1; # insert commas, periods, line breaks and sentence capitals?
my $filetype = "xml"; # type of file to create. can also be 'html' or 'txt'
my $verbose = 0; # when true, progress messages are printed
# Build the command-line help text.
#
# Takes no arguments and returns the usage message as a single string
# (newline terminated), ready to be appended to a die/warn message.
# The option names listed here must match the ones registered with
# GetOptions: the file-count option is --num_files, and --englishify is a
# negatable flag that is on by default.
sub Usage {
    return "make_collection: [--dict=words.txt] [--base_dir=/your/location]\n" .
        "[--min_words_per_file=#] [--max_words_per_file=#] [--num_files=#]\n" .
        "[--verbose] [--(no)englishify] [--filetype=(txt|html|xml)] [--(no)randommode]\n" .
        " Makes a set of (possibly random) xml, html, or txt files based on a dict.\n";
}
# ---------------------------------------------------------------------------
# Main script: parse options, load the dictionary, then generate $num_files
# documents. Each document is either written to "$base_dir/<n>.<filetype>"
# or, when no --base_dir is given, printed to stdout in the form produced by
# simple_swishe_progify (swish-e external program mode).
# ---------------------------------------------------------------------------

# Each option overrides the default of the same name declared above.
GetOptions(
    "min_words_per_file=i" => \$min_words_per_file,
    "max_words_per_file=i" => \$max_words_per_file,
    "num_files=i"          => \$num_files,
    "base_dir=s"           => \$base_dir,
    "dict=s"               => \$dict,
    "englishify!"          => \$englishify,
    "randommode!"          => \$randommode,
    "filetype=s"           => \$filetype,
    "verbose!"             => \$verbose,
) or die "$prog: Couldn't parse options:\n" . Usage();

die "$prog: Error: Filetype '$filetype' not understood\n" . Usage()
    unless $filetype =~ /\A(?:txt|xml|html?)\z/i;

if ($verbose) {
    warn "$prog: Warning: No --base_dir option, running as swish-e external program\n"
        unless $base_dir;
}

my $parser = choose_parser($filetype);

if ($max_words_per_file < $min_words_per_file) {
    warn "$prog: max_words_per_file must be larger than min_words_per_file";
}

# ref to wordlist, and ref to counthash
my ($words, $word_counts) = get_dictionary_words( $dict );

if ($num_files == 0) {
    $num_files = scalar(@$words);
    print STDERR "$prog: set num_files to $num_files\n" if $verbose;
}

# Without words there is nothing to build documents from. Failing here gives
# a clear message instead of an "Illegal modulus zero" error (sequential
# mode) or files that contain only punctuation (random mode).
if ($num_files > 0 && !@$words) {
    die "$prog: Error: No words found in dictionary '$dict'\n";
}

# Diagnostics go to STDERR: in external-program mode STDOUT carries the
# generated documents and must not be polluted.
print STDERR "Creating files...\n" if $verbose;

for my $i (0 .. $num_files - 1) {
    # Choose how many words will be in this file.
    my $this_file_words =
        int( not_rand( $max_words_per_file - $min_words_per_file + 1 ) ) + $min_words_per_file;

    my $doc   = "";
    my $toCap = 1;    # should we Capitalize the coming word?

    for my $j (0 .. $this_file_words - 1) {
        # Choose the next word, either randomly or sequentially.
        # NOTE(review): the sequential index depends only on the file number
        # $i, so every word within one file is the same dictionary entry.
        # Confirm whether that is intended when max_words_per_file > 1.
        my $toadd = $randommode
            ? $words->[ not_rand( scalar(@$words) ) ]
            : $words->[ $i % scalar(@$words) ];

        # Skip missing entries *before* touching the word; capitalising an
        # undefined value would warn and turn it into a defined empty string.
        next unless defined $toadd;

        if ($toCap) {
            $toadd = ucfirst $toadd;
            $toCap = 0;
        }
        $doc .= $toadd;

        if ($englishify) {
            # Random number used to drop in punctuation and line breaks.
            my $r = int( not_rand(10000) );
            if ( $j == $this_file_words - 1 || $r % 9 == 0 ) {
                # End of sentence: the next word starts with a capital.
                $doc .= ". ";
                $toCap = 1;
            }
            elsif ( $r % 7 == 0 ) {
                $doc .= ",";
            }
            $doc .= ( $j + $i + $r + 1 ) % 5 ? " " : "\n";
        }
        else {
            # Plain mode: a line break after every seventh word.
            $doc .= ( $j + 1 ) % 7 ? " " : "\n";
        }
    }

    # Wrap the raw text according to the requested file type. 'htm' is
    # accepted by the option validation above, so it is treated as html here.
    if ( $filetype =~ /^xml$/i ) {
        $doc = simple_xmlify( $doc );
    }
    elsif ( $filetype =~ /^html?$/i ) {
        $doc = simple_htmlify( extract_title($doc), $doc );    # title, content
    }
    else {
        $doc = simple_txtify( $doc );
    }

    if ($base_dir) {
        my $path = "$base_dir/$i.$filetype";
        # Three-argument open with a lexical handle: the path is taken
        # literally and the handle cannot leak out of this iteration.
        open( my $out_fh, '>', $path ) or die "$prog: Couldn't open $path: $!";
        print {$out_fh} $doc;
        close($out_fh) or die "$prog: Couldn't close $path: $!";
        print STDERR "$prog: created $path...\n" if ( $verbose && $i % 1000 == 0 );
    }
    else {
        # act like a swish-e external program
        print simple_swishe_progify( $parser, "$i.$filetype", $doc, scalar(localtime(time())) );
    }
}
# Wrap one block of text for output as an xml document.
#
# Takes the text as its only argument and returns it with a two-newline
# prefix and a three-newline suffix.
# NOTE(review): prefix and suffix currently consist of newlines only; if xml
# markup was meant to surround the text it is not present here -- confirm.
sub simple_xmlify {
    my ($content) = @_;
    my $head = qq{\n\n};
    my $tail = "\n\n\n";
    return join '', $head, $content, $tail;
}
# Format one block of text as a plain txt document.
#
# Takes the text as its only argument and returns it with a single trailing
# newline appended.
sub simple_txtify {
    my ($content) = @_;
    return $content . "\n";
}
# one block of text, with a title, in html
sub simple_htmlify {
my ($title, $content) = @_;
my $html = <
$title
$content