#!/usr/bin/perl
use strict;
use warnings;

# Code point => name table, indexed by numeric code point.  Names are stored
# with spaces replaced by underscores.  (The lexical array @a is a different
# variable from the $a used inside sort blocks.)
my @a;
# Code points recorded by tostringdef; the keys determine which stringdef
# lines are generated later.
my %s;

# STDIN supplies semicolon-separated records whose first field is the code
# point in hex and whose second field is the character name,
# e.g. 0942;DEVANAGARI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
while (my $line = <STDIN>) {
    chomp $line;
    my ($hex, $name) = split /;/, $line;

    # Skip blank or malformed records: without this, hex() of a non-hex
    # field yields 0 and would overwrite the entry for U+0000 with undef.
    next unless defined $name && $hex =~ /\A[0-9A-Fa-f]+\z/;

    $name =~ tr/ /_/;
    $a[hex $hex] = $name;
}

# Translate a single character into a "{NAME}" string escape.
#
# Takes one argument, a one-character string.  Looks the character's code
# point up in @a and returns the name wrapped in braces.  Code points that
# have a name are recorded in %s so a matching stringdef can be generated.
#
# A diagnostic is written to STDERR when the code point is >= 0x10000 or,
# otherwise, when @a holds no (non-empty) name for it.  With no name the
# return value is "{}" and the code point is not recorded, so no nameless
# stringdef line is produced for it.
sub tostringdef {
    my ($ch) = @_;
    my $codepoint = ord($ch);

    # Reading past the end of @a, or reading an unassigned slot, simply
    # yields undef; test with defined() rather than comparing undef to ''.
    my $name = $a[$codepoint];
    my $has_name = defined $name && $name ne '';

    if ($codepoint >= 0x10000) {
        print STDERR "Code points >= 0x10000 not supported by snowball\n";
    } elsif (!$has_name) {
        printf STDERR "No mapping for code point U+%04x\n", $codepoint;
    }

    return '{}' unless $has_name;

    $s{$codepoint}++;
    return '{' . $name . '}';
}

# Read the file named by the first command-line argument as UTF-8 and
# replace every non-ASCII character with its {NAME} string escape.
my $source = $ARGV[0];
defined $source or die "Usage: $0 SOURCE_FILE < UNICODE_NAME_TABLE\n";

open my $in, '<:encoding(UTF-8)', $source
    or die "Cannot open '$source': $!\n";

my $out = '';
while (my $line = <$in>) {
    # Anything outside the ASCII range is translated.  A capture group is
    # used instead of $&, and the class no longer names an upper bound
    # beyond the Unicode maximum of U+10FFFF.
    $line =~ s/([^\x00-\x7F])/tostringdef($1)/ge;
    $out .= $line;
}
close $in or die "Error reading '$source': $!\n";

# Build one stringdef per recorded code point, in ascending numeric order.
my $stringdefs = "stringescapes {}\n\n";
for my $codepoint (sort { $a <=> $b } keys %s) {
    my $name = $a[$codepoint];
    # A recorded code point without a name would yield a nameless
    # stringdef line, so leave it out.
    next unless defined $name && $name ne '';
    $stringdefs .= sprintf "stringdef %s hex '%04x'\n", $name, $codepoint;
}

# Insert the definitions just before the first line that starts with
# "externals"; when there is no such line, put them at the top instead.
$out =~ s/(?=\nexternals\b)/\n$stringdefs/ or $out = "$stringdefs\n$out";

print $out;
