#!/usr/bin/perl -wn
# Copyright (C) 2009-2011  Antonio Bonafonte
#            Universitat Politècnica de Catalunya, Barcelona, Spain
#
#  This script is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation,
#  version 2.1 of the License.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


# Purge short words (4 characters or fewer), contractions using "'",
# abbreviations using ".", and a fixed list of odd dictionary entries.

# Per-line filter: the -n switch runs this code once for every input line,
# with the line in $_.  Lines shaped like
#     ("word" pos (transcription))
# are echoed to STDOUT unless the word is rejected; every rejected word is
# reported on STDERR.  Lines that do not have this shape are dropped silently.

# Words rejected by exact string match.  The table is filled once, at compile
# time, because everything below is re-executed for every input line.
# NOTE(review): several entries (e.g. "espaa", "dsseldorf", "zrich") look as
# if non-ASCII letters were stripped from them at some point; as written they
# only match these literal strings -- verify against the dictionary encoding.
our %is_weird_word;
BEGIN {
    %is_weird_word = map { $_ => 1 } qw(
        status       bayern       cincia-ficci dirna        dsseldorf
        espaa        genticament  iaki         muoz         mller
        nez          prviament    puigcerd     rell         sueo
        sriament     tall         tats         tcnicament   zrich
        frica        rtic         ustria       vila         lvarez
        ndia         bviament     croissant    cruyff       disney
        edward       finley       haixix       kelly        leyla
        loyola       mayor        megabytes    myller       netscape
        rajoy        reyes        reykiavik    rugby        sydney
        thyssen      tsars        walter       washington   weissmann
        wellington   whisky       whiskys      wilson       windows
        yakarta      dawai
    );
}

# Decide whether a word has to be purged.
#   $word - the headword extracted from a dictionary line.
# Returns "too weird" or "too short" when the word is rejected, and undef
# when it is kept.  The "weird" tests run first, so a short word that is
# also listed (or contains "." or "'") is reported as "too weird".
sub skip_reason {
    my ($word) = @_;

    return 'too weird'
        if $word =~ /\./             # abbreviations
        || $word =~ /'/              # contracted articles and weak pronouns
        || $is_weird_word{$word};    # odd entries listed in the table above

    # Words of 4 characters or fewer are considered too short.
    return 'too short' if length($word) <= 4;

    return;
}

chomp;

# One match both validates the line and extracts the headword.  The part of
# speech must be non-empty; lines failing the match are skipped without any
# message.  The list assignment is true whenever the match succeeds, even if
# the captured word is the empty string.
if ( my ($word) = m/\("(.*)"\s(.+)\s\((.*)\)$/ ) {
    my $reason = skip_reason($word);
    if ( defined $reason ) {
        print STDERR "Skipping $word, $reason!\n";
    }
    else {
        # chomp removed the line terminator above; put one back.
        print "$_\n";
    }
}
