36 lines
902 B
Perl
36 lines
902 B
Perl
# Build a corpus list file from a tree of "*.words" transcription files.
#
# Command-line arguments:
#   $ARGV[0]  transcription_dir - directory tree searched for .words files
#   $ARGV[1]  path_prefix       - prefix put in front of every audio path
#   $ARGV[2]  output_file       - corpus list to create (overwritten)
#
# The usage banner is printed on every run.
print "Usage: perl $0 transcription_dir path_prefix output_file\n";
print "Example: perl $0 /direct/datadigest/read_English_SG/gitm/transcription /direct/datadigest/read_English_SG/gitm/calls/ ~/gitm.list\n";

use File::Find;
use File::Copy;

# Fail early with a clear message instead of a confusing open() error when
# arguments are missing. Extra arguments are still ignored.
die "Expected 3 arguments (transcription_dir path_prefix output_file), got "
    . scalar(@ARGV) . ".\n"
    if @ARGV < 3;

# Three-argument open: the file name is taken literally, never parsed for a
# mode. CORPUS stays a package-level handle because the wanted() callback
# prints to it.
open(CORPUS, '>', $ARGV[2])
    or die "Cannot open corpus file $ARGV[2] for write: $!\n";

# Normalise "path_prefix" so that it always ends with exactly the slash the
# callback expects. $prefix is a package global read by wanted().
$prefix = $ARGV[1];
$prefix .= '/' unless $prefix =~ m|/$|;

@dirs = ($ARGV[0]);

# Walk the transcription tree; wanted() is called for every entry found.
find({ wanted => \&wanted }, @dirs);

# Buffered write errors (e.g. a full disk) only surface at close time.
close(CORPUS) or die "Cannot close corpus file $ARGV[2]: $!\n";
|
|
|
|
# File::Find callback: turn one "<folder>_<uttN>.words" transcript into one
# line of the corpus list.
#
# Inputs (not passed as arguments):
#   $_       - name of the current entry, as set by File::Find; it is opened
#              as-is, so it must be resolvable from the current directory.
#   $prefix  - package global holding the audio path prefix.
#   CORPUS   - package-level output handle, already opened for writing.
#
# Output: for a matching file, prints to CORPUS
#   "<prefix><group>/<folder>/<folder>_<utt>.ulaw\t<first line of the file>\n"
# where <group> is the three digits that follow the leading letters of
# <folder>.
#
# Entries whose name does not match the pattern are ignored. A matching file
# whose folder name has no "letters + three digits" start is skipped with a
# warning. Dies if a matching file cannot be opened.
sub wanted
{
    use strict;
    use warnings;
    our $prefix;

    my $file = $_;

    # Captures are assigned directly so a failed match can never leak a
    # stale $1/$2 from an earlier match into the output.
    my ($folder, $utt) = $file =~ m|^([a-zA-Z0-9_]+)_(utt\d+)\.words$|
        or return;

    # usually it's 000, but not always. So the group needs to be extracted.
    my ($group) = $folder =~ m|^[A-Za-z]+(\d\d\d)|;
    unless (defined $group)
    {
        my $shown = defined $File::Find::name ? $File::Find::name : $file;
        warn "Skipping $shown: folder name '$folder' has no three-digit group\n";
        return;
    }

    open(my $words_fh, '<', $file)
        or die "Cannot open words file $file: $!\n";
    my $words = <$words_fh>;    # only the first line is used
    close($words_fh);

    # An empty transcript file yields an empty transcription field.
    $words = '' unless defined $words;
    chomp($words);

    print CORPUS "$prefix$group/$folder/${folder}_${utt}.ulaw\t$words\n";
    return;
}
|