#!/usr/bin/perl -w
#
# namesift --crb3 10nov01
#
# Scans each named text file for pairs of adjacent capitalized words
# (candidate proper names) and prints how often each pair occurs.
# Words listed after __END__ are never accepted as part of a name.
#
# usage: namesift [-fV] [-lV] [-bV] filename...
#   -l   per-file lists                          (default "0", off)
#   -f   append the filename to per-file lines   (default "0", off)
#   -b   combined BIG LIST over all files        (default "1", on)
# A bare switch toggles its setting; a switch followed by a value
# (e.g. -l1, -b0) stores that value. Only the value "1" means "on".
# Unknown switches are silently ignored.
#

my $chatty    = 0;      # set nonzero to dump the ignore list at startup
my $listfiles = "0";
my $biglist   = "1";
my $dolists   = "0";

our %ignores;           # package global: siftem() consults it
my  %total_for;         # name pair => count summed over all files

die "usage: namesift filename\n" unless @ARGV;

# switch letter => the setting it controls
my %setting_for = (
    f => \$listfiles,
    l => \$dolists,
    b => \$biglist,
);

# Consume leading switches. The @ARGV test stops the loop cleanly when
# every argument was a switch.
while (@ARGV and substr($ARGV[0], 0, 1) eq "-") {
    my $switch = shift(@ARGV);
    substr($switch, 0, 1) = "";             # hack off -
    next unless length($switch);            # lone "-": nothing to set
    my $c = substr($switch, 0, 1, "");      # letter; $switch keeps the value
    my $setting = $setting_for{$c} or next;
    if (length($switch)) {
        # length, not defined: $switch is "" (never undef) for a bare switch
        $$setting = $switch;
    }
    else {
        $$setting = ($$setting eq "0" ? "1" : "0");    # toggle
    }
}

# switches may have used up every argument
die "usage: namesift filename\n" unless @ARGV;

# Load the ignore list from the text after __END__. Every
# whitespace-separated word becomes its own key, so the list works with
# one word per line or several words on a line.
while (defined(my $dline = <DATA>)) {
    foreach my $word (split(' ', $dline)) {
        $ignores{$word} = $word;
    }
}

if ($chatty) {
    print "IGNORES:\n";
    foreach my $key (sort(keys %ignores)) {
        print "$key\n";
    }
    print "\n";
}

foreach my $fn (@ARGV) {
    my %nhash = siftem($fn);

    # highest count first; equal counts in alphabetical order
    foreach my $key (sort { $nhash{$b} <=> $nhash{$a} or $a cmp $b }
                     keys %nhash) {
        if ($dolists eq "1") {
            if ($listfiles eq "1") {
                print "$key: $nhash{$key}\t$fn\n";
            }
            else {
                print "$key: $nhash{$key}\n";
            }
        }
        $total_for{$key} += $nhash{$key} if $biglist eq "1";
    }
    print "\n";     # one blank line per file, even with per-file lists off
}

if ($biglist eq "1") {
    print "\nBIG LIST:\n";
    foreach my $key (sort { $total_for{$b} <=> $total_for{$a} or $a cmp $b }
                     keys %total_for) {
        print "$key: $total_for{$key}\n";
    }
    print "\n";
}

#
# siftem.
# per-file sifter.
# takes a filename. opens it and scans the lines,
# looking for two proper names together. file must be TXT, more or
# less in fanfic format (\n\n separate paragraphs).
# builds a hash with those as keys and recurrence counts as values.
# closes the file and returns the hash.
#
# siftem($fname)
#   $fname   path of the text file to scan
#   returns  a hash (as a list) of "Word1 Word2" => occurrence count
#   dies     if the file cannot be opened
#
# Lines are gathered into paragraphs; an empty or whitespace-only line
# ends a paragraph. Pairs are only counted within one paragraph.
#
sub siftem {
    my ($fname) = @_;
    my %pnames;             # "Word1 Word2" => occurrences in this file
    my $bigline = "";       # current paragraph joined into one big line

    open(my $ifil, '<', $fname) or die "can't open file $fname: $!\n";
    while (defined(my $inline = <$ifil>)) {
        chomp $inline;
        if ($inline =~ /^\s*$/) {
            # blank line: the paragraph is complete
            _tally_name_pairs($bigline, \%pnames);
            $bigline = "";
        }
        else {
            $bigline .= " $inline";     # one big line per pgraph
        }
    }
    # the file need not end with a blank line; count what is left
    _tally_name_pairs($bigline, \%pnames);
    close($ifil);

    return (%pnames);
}

#
# _tally_name_pairs($paragraph, \%pnames)
# Adds one to $pnames->{"Word1 Word2"} for every two consecutive words
# of $paragraph that both look like names (a capital letter followed by
# one or more lowercase letters or underscores) and are not keys of the
# package global %ignores.
#
sub _tally_name_pairs {
    my ($paragraph, $pnames) = @_;
    our %ignores;           # ignore list, filled in by the main program

    my $name_re = qr/^[A-Z][a-z_]+$/;
    my $oldwrd  = "";
    foreach my $wrd (split(' ', $paragraph)) {
        $wrd =~ s/\W+$//;   # shave off punctuation
        $wrd =~ s/^\W+//;   # ditto
        if (    $wrd    =~ $name_re
            and $oldwrd =~ $name_re
            and !exists($ignores{$wrd})
            and !exists($ignores{$oldwrd}) )
        {
            $pnames->{"$oldwrd $wrd"}++;
        }
        $oldwrd = $wrd;
    }
    return;
}

__END__
Jusenkyou
Jusenkyo
Chinese
China
Amazon
Amazons
Nyanniichuan
Nanniichuan
Nerima
Furinkan
Tokyo
Japan
Japanese
English
Days
Character
Copyright
Yes
Joketsu
Joketsuzoku
The
This
She
He
They
It
If
As
Ask
What
Who
Why
When
Where
Because
Upsetting
Thanks
Shit
Fuck
Piss
Cunt
Motherfucker
Nigger
Every
Everyone
Everything
Everywhere
Nothing
Perhaps
Sorry
Greetings
Greeting
World
Daddy
Damn
From
Within
Without
About
Whoa
Evening
Morning
Night
Noon
Neither
Now
Then
Would
Wouldnt
Please
Thank
That
This
School
Beijing
Fortunately
Version
Service
Adult
Lemon
Education
Terrible
Wonderful
And
Also
Anything
Animal
Control
British
American
Department
Police
Fire
Program
National
Insurance
Bullshit
Here
There
Hello
Goodbye
Great
Beware
Good
Husband
Wife
Child
Okay
Operation
Woman
Man
Chapter
Nine
Eight
Seven
Six
Five
Four
Three
Two