#!/usr/bin/perl
##########################################################
#
# demess.cgi
#
# CGI script that takes an htm file output from MS Word and
# generates a valid XHTML file without all the extra garbage
# Microsoft insists on leaving in the file.  demess.cgi
# will generate a smaller HTML file that's faster to load
# and much, much easier to maintain away from Word.
#
# Most original coding by Laurie Mann (comments - LM)
# http://www.dpsinfo.com
#
# Based on a script written by Jim Mann (comments - JM)
# http://www.norstrilia.org
#
##########################################################

# LM: To use this script, copy it to your cgi-bin directory.
# Check to make sure the first line of this file is the correct
# path to the perl directory on your machine.  You may need to reset
# the permissions; if it doesn't run properly, try typing
# chmod 755 demess.cgi from the Unix command line.
#
# Take any MS Word or Excel file and save it as an htm file.
# Copy the .htm file to your cgi-bin directory, and,
# from the command line, type:
#
# demess.cgi file.htm (file = name of your file)

# TODO: "use strict" is not enabled here because the statements that
# follow this setup section assign to the undeclared global $contents.

# LM: The Basename coding is from The Perl Cookbook,
# Tom Christiansen & Nathan Torkington, O'Reilly, 1999,
# pp. 328-329, Example 9.10.
use File::Basename;
use File::Spec;

# JM: The script reads in the whole file, not just a line at a time.
# This is deliberately a global change rather than a "local" one: the
# read itself happens after this setup section and relies on slurp
# mode still being in effect.
undef $/;

# JM: $ARGV[0] is perl for the first command line arg,
# the name of the file to convert.
my $filename = $ARGV[0];
die "Usage: $0 file.htm\n"
    unless defined $filename && length $filename;

# Split the input path into name, directory and extension.  The suffix
# pattern '\..*' matches from the FIRST dot of the basename, so
# "report.v2.htm" yields the name "report".
# (The earlier basename/dirname/fileparse calls were removed: they ran
# before the path was assigned and their results were overwritten.)
my ($name, $dir, $ext) = fileparse($filename, '\..*');

# LM: Revised script to accommodate changing extension.
# The output is created in the current directory, not in $dir.
my $newname = $name . ".html";

# Opening the output with '>' truncates it immediately.  If the output
# path is the input path (e.g. the input is already NAME.html in the
# current directory), the input would be emptied before it is read.
die "$0: output '$newname' would overwrite the input '$filename'\n"
    if File::Spec->rel2abs($filename) eq File::Spec->rel2abs($newname);

# JM: open both files.  The '>' in the OUTFILE is how you
# specify "for output".
# Three-argument open keeps characters in the file name from being
# interpreted as an open mode; both opens are checked so a bad path
# stops the script instead of silently producing an empty result.
# The bareword handles INFILE and OUTFILE are kept because the rest
# of the script refers to them by name.
open(INFILE, '<', $filename)
    or die "$0: cannot open '$filename' for reading: $!\n";
open(OUTFILE, '>', $newname)
    or die "$0: cannot open '$newname' for writing: $!\n";

# JM: <INFILE> is just Perl syntax for "read from
# the input file."
$contents = ; # LM: header conversion $contents =~ s/]+>/\n /g; $contents =~ s/]+>//g; $contents =~ s/]+>//g; $contents =~ s///g; $contents =~ s/<\/xml>//g; $contents =~ s/.*?WordDocument>//sg; $contents =~ s/.*?OfficeDocumentSettings>//sg; $contents =~ s/.*?ExcelWorkbook>//sg; # LM: Populating meta tags, some with data from the Word file. $contents =~ s/(.*)<[^>]+>/ /g; $contents =~ s/(.*)<[^>]+>//g; $contents =~ s/(.*)<[^>]+>//g; $contents =~ s/(.*)<[^>]+>//g; $contents =~ s/(.*)<[^>]+>/ /g; # LM: Rebuilding the body # /g usually takes care of the global conversion, UNLESS the # contents of the tags extend over end-of-lines. If that # can happen, use /sg. (This took over two hours, four # books, various examples and a consultation with # with JM before I found the answer in one of the books.) $contents =~ s/]+>//g; $contents =~ s/]+>//g; $contents =~ s///sg; $contents =~ s//

/g; $contents =~ s/

/

/sg; $contents =~ s/]+>//sg; $contents =~ s/<\/span>//g; $contents =~ s/]+>//sg; $contents =~ s/<\/div>//g; $contents =~ s/]+>.*?<\/v:shape>//sg; $contents =~ s/v:shapes=\".*?\"/alt=\" \"/g; $contents =~ s/<\/p><\/p>/<\/p>/g; $contents =~ s/]+>/
/sg; $contents =~ s/
]+>/
/sg; $contents =~ s/]+>//g; $contents =~ s//g; $contents =~ s/<\/p>\s+<\/td>/<\/td>/sg; $contents =~ s/

 <\/p>//g; # JM: Print it all to outfile. Note that if we hadn't # undefed $/ at the beginning, we'd have been reading, # processing, then writing a line at a time. print OUTFILE "$contents";

]+>//g; $contents =~ s/]+>//sg; $contents =~ s/]+>//sg; $contents =~ s/]+>//sg; $contents =~ s/]+>//sg; $contents =~ s/]+>//sg; $contents =~ s/]+>//sg; $contents =~ s/]+>//sg; $contents =~ s/]+><\/td>/ <\/td>/sg; $contents =~ s/ / /sg; $contents =~ s/]+>\s+

/

/sg; $contents =~ s/]+>/