u
This commit is contained in:
35
script_nuance/tools/gather_transcription_from_wordsfile.pl
Normal file
35
script_nuance/tools/gather_transcription_from_wordsfile.pl
Normal file
@@ -0,0 +1,35 @@
|
||||
# Build a corpus list file from per-utterance ".words" transcription files.
#
# Arguments:
#   $ARGV[0]  transcription_dir - directory tree searched for *.words files
#   $ARGV[1]  path_prefix       - prefix put in front of every audio path
#   $ARGV[2]  output_file       - list file to create (overwritten)
#
# The usage banner is printed on every run, as it always has been.
print "Usage: perl $0 transcription_dir path_prefix output_file\n";
print "Example: perl $0 /direct/datadigest/read_English_SG/gitm/transcription /direct/datadigest/read_English_SG/gitm/calls/ ~/gitm.list\n";

use File::Find;
use File::Copy;

# Fail early with a clear message instead of a confusing open() error.
die "Expected 3 arguments (transcription_dir path_prefix output_file), got " . scalar(@ARGV) . ".\n"
    if @ARGV < 3;

# CORPUS stays a package-level (bareword) handle because the File::Find
# callback prints to it.  Three-argument open keeps characters in the file
# name from being interpreted as an open mode.
open(CORPUS, '>', $ARGV[2])
    or die "Cannot open corpus file $ARGV[2] for write: $!\n";

# $prefix is deliberately a package global: the File::Find callback reads it.
# Make sure it ends with exactly the "/" needed to join path components.
$prefix = $ARGV[1];
$prefix .= '/' unless $prefix =~ m|/$|;

@dirs = ($ARGV[0]);

find({ wanted => \&wanted }, @dirs);

# Buffered write errors (disk full, ...) only surface at close time.
close(CORPUS)
    or die "Cannot close corpus file $ARGV[2]: $!\n";
|
||||
|
||||
# File::Find callback, called with $_ set to the current file name.
#
# For every file named "<folder>_utt<digits>.words" it reads the first line
# of the file and appends one line to the CORPUS handle:
#
#   <prefix><group>/<folder>/<folder>_utt<digits>.ulaw<TAB><first line>
#
# <group> is the three digits following the leading letters of <folder>.
# Reads the package global $prefix and prints to the package handle CORPUS;
# both are expected to be set up by the caller before find() runs.
# Files whose folder name carries no such digits are skipped with a warning.
# Dies if a matching words file cannot be opened.
sub wanted
{
    # Anything that is not a per-utterance words file is ignored.
    return unless m|^([a-zA-Z0-9_]+)_(utt\d+)\.words$|;
    my ($folder, $utt) = ($1, $2);

    # Usually the group is 000, but not always, so it has to be extracted.
    # The match result must be checked: on failure $1 would still hold the
    # folder name from the match above and a bogus path would be written.
    my ($group) = $folder =~ m|^[A-Za-z]+(\d\d\d)|;
    unless (defined $group)
    {
        warn "Skipping $_: cannot extract a 3-digit group from folder name $folder\n";
        return;
    }

    open(my $words_fh, '<', $_) or die "Cannot open words file $_: $!\n";
    my $words = <$words_fh>;    # only the first line holds the transcription
    close($words_fh);

    # An empty words file yields undef; emit an empty transcription instead.
    $words = '' unless defined $words;
    chomp($words);

    print CORPUS "$prefix$group/$folder/${folder}_${utt}.ulaw\t$words\n";
    return;
}
|
||||
Reference in New Issue
Block a user