#!/usr/bin/perl -w package ACG::Parser; BEGIN { use strict; use XML::Parser::PerlSAX; use vars qw($attr $cdata $liHtext $liText $current_element %NonSpanners %ListItems $lastHref); %NonSpanners = ( hr => 1, br => 1, img => 1, link => 1, meta => 1, ); } sub init { my ( $self ) = @_; $self->{liOff} = $self->{ulOff} = 1; $self->{liLang} = $self->{spanLang} = undef; $self->{ulNested} = 0; } sub new { my ($class) = @_; my $self = {}; $self->{lang} = $_[2]; my $blessing = bless ($self, $class); $self->init; $blessing; } sub getAttrString { my $attrString; foreach (sort keys %$attr) { $attrString .= qq( $_="$attr->{$_}"); } $attrString; } sub comment { $cdata = undef if ( $cdata && ($cdata =~ /^\s+$/s) ); $current_element = "comment"; } sub append { my $self = shift; $_ = shift; if ( $self->{spanLang} ) { if ( $self->{spanLang} eq $self->{lang} ) { if ( $self->{liOff} ) { $cdata .= $_; } else { $liText .= $_; $liHtext .= $_; } } } elsif ( $self->{liOff} ) { $cdata .= $_; } else { $liText .= $_; $liHtext .= $_; } } sub entity_reference { my ($self, $entity) = @_; return if ( ($current_element eq "span") && (!$self->{spanLang}) && ($entity->{Name} eq "nbsp") ); $self->append ( "&$entity->{Name};" ); } sub start_element { my ($self, $element) = @_; if ( $cdata ) { if ( $self->{ulOff} || ( $current_element eq "ul" ) ) { print $cdata; } else { $ListItems{$liText} .= $cdata; } $cdata = undef; } $current_element = $element->{Name}; my $attrString; if ( $element->{Attributes} ) { $attr = $element->{Attributes}; $attrString = $self->getAttrString (); $self->{spanLang} = $attr->{lang} if ( ($current_element eq "span") && $attr->{lang} ); } my $tag; if ( $current_element eq "li" ) { if ( $self->{ulNested} ) { $tag = "
  • "; } else { $self->{liOff} = 0; $liText = undef; $self->{liLang} = $attr->{lang} if ( $attr && $attr->{lang} ); } } elsif ( $NonSpanners{$current_element} ) { $tag = "<$current_element"; $tag .= ( $attrString ) ? "$attrString />" : " />"; } # # this confuses IE4, don't do it for now # # elsif ( $current_element eq "html" ) { # $tag = qq(); # } elsif ( ( $current_element eq "head" ) && ( $self->{lang} ne "en" ) ) { $tag = qq(\n ); } elsif ( $current_element ne "span" ) { $tag = "<$current_element"; $tag .= ( $attrString ) ? "$attrString>" : ">"; if ( $current_element eq "ul" ) { $self->{ulNested} = 1 unless ( $self->{ulOff} ); $self->{ulOff} = 0; } } if ( $tag ) { if ( $self->{spanLang} ) { if ( $self->{spanLang} eq $self->{lang} ) { if ( $self->{liOff} ) { print $tag; } else { $liHtext .= $tag; } } } else { if ( $self->{liOff} ) { print $tag; } else { $liHtext .= $tag; } } } } sub characters { my ($self, $characters) = @_; $self->append ( $characters->{Data} ); } sub end_element { my ($self, $element) = @_; $current_element = $element->{Name}; # undo nesting clobber if ( $cdata && ( $current_element ne "ul" ) ) { print $cdata; $cdata = undef; } my $tag; if ( $current_element eq "ul" ) { if ( $self->{ulNested} ) { $self->{ulNested} = 0; $tag = ( $cdata ) ? "$cdata" : ""; } else { foreach ( sort keys %ListItems ) { print qq(
  • $ListItems{$_}); } print "$cdata"; $self->{ulOff} = 1; %ListItems = (); } } elsif ( $current_element eq "span" ) { $self->{spanLang} = undef; } elsif ( !$NonSpanners{$current_element} ) { $tag = ""; } if ( $tag ) { if ( $self->{liOff} ) { print $tag; } else { $liHtext .= $tag; } } if ( ( $current_element eq "li" ) && !$self->{ulNested} ) { $ListItems{$liText} = $liHtext unless ( $self->{liLang} && ( $self->{liLang} ne $self->{lang} ) ); $self->{liOff} = 1; $self->{liLang} = $liHtext = undef; } $attr = undef; } sub end_document { $_[0]->init; } package main; require XML::Parser::PerlSAX; open (SRCDOC, $ARGV[0] ) || die ( "$ARGV[0] not found." ); my $index = join ( "", ); close ( SRCDOC ); my $handler = new ACG::Parser ( lang => $ARGV[1] ); XML::Parser::PerlSAX->new ( Handler => $handler )->parse( $index );