#!/usr/bin/perl -w
package ACG::Parser;
BEGIN
{
use strict;
use XML::Parser::PerlSAX;
use vars qw($attr $cdata $liHtext $liText $current_element %NonSpanners %ListItems $lastHref);
%NonSpanners = (
hr => 1,
br => 1,
img => 1,
link => 1,
meta => 1,
);
}
sub init
{
my ( $self ) = @_;
$self->{liOff} = $self->{ulOff} = 1;
$self->{liLang} = $self->{spanLang} = undef;
$self->{ulNested} = 0;
}
sub new
{
my ($class) = @_;
my $self = {};
$self->{lang} = $_[2];
my $blessing = bless ($self, $class);
$self->init;
$blessing;
}
sub getAttrString
{
my $attrString;
foreach (sort keys %$attr) {
$attrString .= qq( $_="$attr->{$_}");
}
$attrString;
}
sub comment
{
$cdata = undef if ( $cdata && ($cdata =~ /^\s+$/s) );
$current_element = "comment";
}
sub append
{
my $self = shift;
$_ = shift;
if ( $self->{spanLang} ) {
if ( $self->{spanLang} eq $self->{lang} ) {
if ( $self->{liOff} ) {
$cdata .= $_;
}
else {
$liText .= $_;
$liHtext .= $_;
}
}
}
elsif ( $self->{liOff} ) {
$cdata .= $_;
}
else {
$liText .= $_;
$liHtext .= $_;
}
}
sub entity_reference
{
my ($self, $entity) = @_;
return if ( ($current_element eq "span") && (!$self->{spanLang}) && ($entity->{Name} eq "nbsp") );
$self->append ( "&$entity->{Name};" );
}
sub start_element
{
my ($self, $element) = @_;
if ( $cdata ) {
if ( $self->{ulOff} || ( $current_element eq "ul" ) ) {
print $cdata;
}
else {
$ListItems{$liText} .= $cdata;
}
$cdata = undef;
}
$current_element = $element->{Name};
my $attrString;
if ( $element->{Attributes} ) {
$attr = $element->{Attributes};
$attrString = $self->getAttrString ();
$self->{spanLang} = $attr->{lang} if ( ($current_element eq "span") && $attr->{lang} );
}
my $tag;
if ( $current_element eq "li" ) {
if ( $self->{ulNested} ) {
$tag = "
";
}
else {
$self->{liOff} = 0;
$liText = undef;
$self->{liLang} = $attr->{lang} if ( $attr && $attr->{lang} );
}
}
elsif ( $NonSpanners{$current_element} ) {
$tag = "<$current_element";
$tag .= ( $attrString ) ? "$attrString />" : " />";
}
#
# this confuses IE4, don't do it for now
#
# elsif ( $current_element eq "html" ) {
# $tag = qq();
# }
elsif ( ( $current_element eq "head" ) && ( $self->{lang} ne "en" ) ) {
$tag = qq(\n );
}
elsif ( $current_element ne "span" ) {
$tag = "<$current_element";
$tag .= ( $attrString ) ? "$attrString>" : ">";
if ( $current_element eq "ul" ) {
$self->{ulNested} = 1 unless ( $self->{ulOff} );
$self->{ulOff} = 0;
}
}
if ( $tag ) {
if ( $self->{spanLang} ) {
if ( $self->{spanLang} eq $self->{lang} ) {
if ( $self->{liOff} ) {
print $tag;
}
else {
$liHtext .= $tag;
}
}
}
else {
if ( $self->{liOff} ) {
print $tag;
}
else {
$liHtext .= $tag;
}
}
}
}
sub characters
{
my ($self, $characters) = @_;
$self->append ( $characters->{Data} );
}
sub end_element
{
my ($self, $element) = @_;
$current_element = $element->{Name}; # undo nesting clobber
if ( $cdata && ( $current_element ne "ul" ) ) {
print $cdata;
$cdata = undef;
}
my $tag;
if ( $current_element eq "ul" ) {
if ( $self->{ulNested} ) {
$self->{ulNested} = 0;
$tag = ( $cdata ) ? "$cdata" : "";
}
else {
foreach ( sort keys %ListItems ) {
print qq($ListItems{$_});
}
print "$cdata";
$self->{ulOff} = 1;
%ListItems = ();
}
}
elsif ( $current_element eq "span" ) {
$self->{spanLang} = undef;
}
elsif ( !$NonSpanners{$current_element} ) {
$tag = "$current_element>";
}
if ( $tag ) {
if ( $self->{liOff} ) {
print $tag;
}
else {
$liHtext .= $tag;
}
}
if ( ( $current_element eq "li" ) && !$self->{ulNested} ) {
$ListItems{$liText} = $liHtext
unless ( $self->{liLang} && ( $self->{liLang} ne $self->{lang} ) );
$self->{liOff} = 1;
$self->{liLang} = $liHtext = undef;
}
$attr = undef;
}
sub end_document
{
$_[0]->init;
}
package main;
require XML::Parser::PerlSAX;
open (SRCDOC, $ARGV[0] ) || die ( "$ARGV[0] not found." );
my $index = join ( "", );
close ( SRCDOC );
my $handler = new ACG::Parser ( lang => $ARGV[1] );
XML::Parser::PerlSAX->new ( Handler => $handler )->parse( $index );