made the pack completely portable and wrote relevent bat files to go with it

2025-04-09 17:04:56 +03:00
parent 5e77d7e9cf
commit 5e4144c3c0
7417 changed files with 2181044 additions and 19 deletions
--- a/gitportable/usr/bin/docx2txt.pl
+++ b/gitportable/usr/bin/docx2txt.pl
@@ -0,0 +1,721 @@
+#!/usr/bin/env perl
+
+# docx2txt, a command-line utility to convert Docx documents to text format.
+# Copyright (C) 2008-2014 Sandeep Kumar
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#
+# This script extracts text from document.xml contained inside .docx file.
+# Perl v5.10.1 was used for testing this script.
+#
+# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
+#
+# ChangeLog :
+#
+#    10/08/2008 - Initial version (v0.1)
+#    15/08/2008 - Script takes two arguments [second optional] now and can be
+#                 used independently to extract text from docx file. It accepts
+#                 docx file directly, instead of xml file.
+#    18/08/2008 - Added support for center and right justification of text that
+#                 fits in a line 80 characters wide (adjustable).
+#    03/09/2008 - Fixed the slip in usage message.
+#    12/09/2008 - Slightly changed the script invocation and argument handling
+#                 to incorporate some of the shell script functionality here.
+#                 Added support to handle embedded urls in docx document.
+#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
+#                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
+#                 during installation.
+#    31/08/2009 - Added support for handling more escape characters.
+#                 Using OS specific null device to redirect stderr.
+#                 Saving text file in binary mode.
+#    03/09/2009 - Updations based on feedback/suggestions from Sergei Kulakov
+#                 (sergei>AT<dewia>DOT<com).
+#                 - removal of non-document text in between TOC related tags.
+#                 - display of hyperlink alongside linked text user controlled.
+#                 - some character conversion updates
+#    05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
+#                 Added more character conversions.
+#                 Organised conversion mappings in tabular form for speedup and
+#                 easy maintenance.
+#                 Tweaked code to reduce number of passes over document content.
+#    10/09/2009 - For leaner text experience, hyperlink is not displayed if
+#                 hyperlink and hyperlinked text are same, even if user has
+#                 enabled hyperlink display.
+#                 Improved handling of short line justification. Many
+#                 justification tag patterns were not captured earlier.
+#    11/09/2009 - A directory holding the unzipped content of .docx file can
+#                 also be specified as argument to the script, in place of file.
+#    17/09/2009 - Removed trailing slashes from input directory name.
+#                 Updated unzip command invocations to handle path names
+#                 containing spaces.
+#    01/10/2009 - Added support for configuration file.
+#    02/10/2009 - Using single quotes to specify path for unzip command. 
+#    04/10/2009 - Corrected configuration option name lineIndent to listIndent.
+#    11/12/2011 - Configuration variables now begin with config_ .
+#                 Configuration file is looked for in HOME directory as well.
+#                 Added a check for existence of unzip command.
+#                 Superscripted cross-references are placed within [...] now.
+#                 Fixed bugs #3003903, #3082018 and #3082035.
+#                 Fixed nullDevice for Cygwin.
+#    12/12/2011 - Configuration file is also looked for in /etc, default
+#                 location for Unix-ish systems.
+#    22/12/2011 - Added &apos; and &quot; to docx specific escape characters
+#                 conversions. [Bug #3463033]
+#    24/12/2011 - Improved handling of special (non-text) characters, along with
+#                 support for more non-text characters.
+#    05/01/2012 - Configuration file is now looked for in current directory,
+#                 user configuration directory and system configuration
+#                 directory (in the specified order). This streamlining allows
+#                 for per user configuration file even on Windows.
+#    14/01/2012 - Wrong code was committed during earlier fixing of nullDevice
+#                 for Cygwin, fixed that.
+#                 Usage is extended to accept docx file from standard input.
+#                 "-h" has to be given as the first argument to get usage help.
+#                 Added new configuration variable "config_tempDir".
+#    14/03/2014 - Remove deleted text from output. This effects in case changes
+#                 are being tracked in docx document. Patch was contributed by
+#                 William Parsons (wbparsons>AT<cshore>DOT<com).
+#                 Removed experimental config option config_exp_extra_deEscape.
+#    27/03/2014 - Remove non-document_text content marked by wp/wp14 tags.
+#    07/04/2014 - Added support for handling lists (bullet, decimal, letter,
+#                 roman) along with (attempt at) indentation.
+#                 Added new configuration variable config_twipsPerChar.
+#                 Removed configuration variable config_listIndent.
+#    14/04/2014 - Fixed list numbering - lvl start value needs to be considered.
+#                 Improved list indentation and corresponding code.
+#    27/04/2014 - Improved paragraph content layout/indentation.
+#    13/05/2014 - Added new configuration variable config_unzip_opts. Users can
+#                 now use unzipping programs like 7z, pkzipc, winzip as well.
+#
+
+
+#
+# The default settings below can be overridden via docx2txt.config in current
+# directory/ user configuration directory/ system configuration directory.
+#
+
+our $config_unzip = '/usr/bin/unzip';	# Windows path like 'C:/path/to/unzip.exe'
+our $config_unzip_opts = '-p';		# To extract file on standard output
+
+our $config_newLine = "\n";		# Alternative is "\r\n".
+our $config_lineWidth = 80;		# Line width, used for short line justification.
+our $config_showHyperLink = "N";	# Show hyperlink alongside linked text.
+our $config_tempDir;			# Directory for temporary file creation.
+our $config_twipsPerChar = 120;		# Approx mapping for layout purpose.
+
+
+#
+# Windows/Non-Windows specific settings. Adjust these here, if needed.
+#
+
+if ($ENV{OS} =~ /^Windows/ && !(exists $ENV{OSTYPE} || exists $ENV{HOME})) {
+    $nullDevice = "nul";
+    $userConfigDir = $ENV{APPDATA};
+
+    #
+    # On Windows, configuration file is installed in same folder as this script.
+    #
+    $0 =~ m%^(.*[/\\])[^/\\]*?$%;
+    $systemConfigDir = $1;
+
+    $config_tempDir = "$ENV{TEMP}";
+} else {
+    $nullDevice = "/dev/null";
+    $userConfigDir = $ENV{HOME};
+    $systemConfigDir = "/etc";
+
+    $config_tempDir = "/tmp";
+}
+
+
+#
+# Character conversion tables
+#
+
+# Only (amp, apos, gt, lt and quot) are the required reserved characters in HTML
+# and XHTML, others are used for better text experience.
+my %escChrs = (	amp => '&', apos => '\'', gt => '>', lt => '<', quot => '"',
+		acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
+		laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
+		reg => '(R)', shy => '-', times => 'x'
+);
+
+my %splchars = (
+    "\xC2" => {
+	"\xA0" => ' ',		# <nbsp> non-breaking space
+	"\xA2" => 'cent',	# <cent>
+	"\xA3" => 'Pound',	# <pound>
+	"\xA5" => 'Yen',	# <yen>
+	"\xA6" => '|',		# <brvbar> broken vertical bar
+#	"\xA7" => '',		# <sect> section
+	"\xA9" => '(C)',	# <copy> copyright
+	"\xAB" => '<<',		# <laquo> angle quotation mark (left)
+	"\xAC" => '-',		# <not> negation
+	"\xAE" => '(R)',	# <reg> registered trademark
+	"\xB1" => '+-',		# <plusmn> plus-or-minus
+	"\xB4" => '\'',		# <acute>
+	"\xB5" => 'u',		# <micro>
+#	"\xB6" => '',		# <para> paragraph
+	"\xBB" => '>>',		# <raquo> angle quotation mark (right)
+	"\xBC" => '(1/4)',	# <frac14> fraction 1/4
+	"\xBD" => '(1/2)',	# <frac12> fraction 1/2
+	"\xBE" => '(3/4)',	# <frac34> fraction 3/4
+    },
+
+    "\xC3" => {
+	"\x97" => 'x',		# <times> multiplication
+	"\xB7" => '/',		# <divide> division
+    },
+
+    "\xCF" => {
+	"\x80" => 'PI',		# <pi>
+    },
+
+    "\xE2\x80" => {
+	"\x82" => '  ',		# <ensp> en space
+	"\x83" => '  ',		# <emsp> em space
+	"\x85" => ' ',		# <qemsp>
+	"\x93" => ' - ',	# <ndash> en dash
+	"\x94" => ' -- ',	# <mdash> em dash
+	"\x95" => '--',		# <horizontal bar>
+	"\x98" => '`',		# <soq>
+	"\x99" => '\'',		# <scq>
+	"\x9C" => '"',		# <doq>
+	"\x9D" => '"',		# <dcq>
+	"\xA2" => '::',		# <diamond symbol>
+	"\xA6" => '...',	# <hellip> horizontal ellipsis
+	"\xB0" => '%.',		# <permil> per mille
+    },
+
+    "\xE2\x82" => {
+	"\xAC" => 'Euro'	# <euro>
+    },
+
+    "\xE2\x84" => {
+	"\x85" => 'c/o',	# <care/of>
+	"\x97" => '(P)',	# <sound recording copyright>
+	"\xA0" => '(SM)',	# <servicemark>
+	"\xA2" => '(TM)',	# <trade> trademark
+	"\xA6" => 'Ohm',	# <Ohm>
+    },
+
+    "\xE2\x85" => {
+	"\x93" => '(1/3)',
+	"\x94" => '(2/3)',
+	"\x95" => '(1/5)',
+	"\x96" => '(2/5)',
+	"\x97" => '(3/5)',
+	"\x98" => '(4/5)',
+	"\x99" => '(1/6)',
+	"\x9B" => '(1/8)',
+	"\x9C" => '(3/8)',
+	"\x9D" => '(5/8)',
+	"\x9E" => '(7/8)',
+	"\x9F" => '1/',
+    },
+
+    "\xE2\x86" => {
+	"\x90" => '<--',	# <larr> left arrow
+	"\x92" => '-->',	# <rarr> right arrow
+	"\x94" => '<-->',	# <harr> left right arrow
+    },
+
+    "\xE2\x88" => {
+	"\x82" => 'd',		# partial differential
+	"\x9E" => 'infinity',
+    },
+
+    "\xE2\x89" => {
+	"\xA0" => '!=',		# <neq>
+	"\xA4" => '<=',		# <leq>
+	"\xA5" => '>=',		# <geq>
+    },
+
+    "\xEF\x82" => {
+	"\xB7" => '*'		# small white square
+    }
+);
+
+
+#
+# Check argument(s) sanity.
+#
+
+my $usage = <<USAGE;
+
+Usage:	$0 [infile.docx|-|-h] [outfile.txt|-]
+	$0 < infile.docx
+	$0 < infile.docx > outfile.txt
+
+	In second usage, output is dumped on STDOUT.
+
+	Use '-h' as the first argument to get this usage information.
+
+	Use '-' as the infile name to read the docx file from STDIN.
+
+	Use '-' as the outfile name to dump the text on STDOUT.
+	Output is saved in infile.txt if second argument is omitted.
+
+Note:	infile.docx can also be a directory name holding the unzipped content
+	of concerned .docx file.
+
+USAGE
+
+die $usage if (@ARGV > 2 || $ARGV[0] eq '-h');
+
+
+#
+# Look for configuration file in current directory/ user configuration
+# directory/ system configuration directory - in the given order.
+#
+
+my %config;
+
+if (-f "docx2txt.config") {
+    %config = do 'docx2txt.config';
+} elsif (-f "$userConfigDir/docx2txt.config") {
+    %config = do "$userConfigDir/docx2txt.config";
+} elsif (-f "$systemConfigDir/docx2txt.config") {
+    %config = do "$systemConfigDir/docx2txt.config";
+}
+
+if (%config) {
+    foreach my $var (keys %config) {
+        $$var = $config{$var};
+    }
+}
+
+#
+# Check for unzip utility, before proceeding further.
+#
+
+die "Failed to locate unzip command '$config_unzip'!\n" if ! -f $config_unzip;
+
+
+#
+# Handle cases where this script reads docx file from STDIN.
+#
+
+if (@ARGV == 0) {
+    $ARGV[0] = '-';
+    $ARGV[1] = '-';
+    $inputFileName = "STDIN";
+} elsif (@ARGV == 1 && $ARGV[0] eq '-') {
+    $ARGV[1] = '-';
+    $inputFileName = "STDIN";
+} else {
+    $inputFileName = $ARGV[0];
+}
+
+if ($ARGV[0] eq '-') {
+    $tempFile = "${config_tempDir}/dx2tTemp_${$}_" . time() . ".docx";
+    open my $fhTemp, "> $tempFile" or die "Can't create temporary file for storing docx file read from STDIN!\n";
+
+    binmode $fhTemp;
+    local $/ = undef;
+    my $docxFileContent = <STDIN>;
+
+    print $fhTemp $docxFileContent;
+    close $fhTemp;
+
+    $ARGV[0] = $tempFile;
+}
+
+
+#
+# Check for existence and readability of required file in specified directory,
+# and whether it is a text file.
+#
+
+sub check_for_required_file_in_folder {
+    stat("$_[1]/$_[0]");
+    die "Can't read <$_[0]> in <$_[1]>!\n" if ! (-f _ && -r _);
+    die "<$_[1]/$_[0]> does not seem to be a text file!\n" if ! -T _;
+}
+
+sub readFileInto {
+    local $/ = undef;
+    open my $fh, "$_[0]" or die "Couldn't read file <$_[0]>!\n";
+    binmode $fh;
+    $_[1] = <$fh>;
+    close $fh;
+}
+
+sub readOptionalFileInto {
+    local $/ = undef;
+
+    stat("$_[0]");
+    if (-f _) {
+        if (-r _ && -T _) {
+            open my $fh, "$_[0]" or die "Couldn't read file <$_[0]>!\n";
+            binmode $fh;
+            $_[1] = <$fh>;
+            close $fh;
+        }
+        else {
+            die "Invalid <$_[0]>!\n";
+        }
+    }
+}
+
+
+
+#
+# Check whether first argument is specifying a directory holding extracted
+# content of .docx file, or .docx file itself.
+#
+
+sub cleandie {
+    unlink("$tempFile") if -e "$tempFile";
+    die "$_[0]";
+}
+    
+
+stat($ARGV[0]);
+
+if (-d _) {
+    check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
+    check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
+    $inpIsDir = 'y';
+}
+else {
+    cleandie "Can't read docx file <$inputFileName>!\n" if ! (-f _ && -r _);
+    cleandie "<$inputFileName> does not seem to be a docx file!\n" if -T _;
+}
+
+
+#
+# Extract xml document content from argument docx file/directory.
+#
+
+my $unzip_cmd = "'$config_unzip' $config_unzip_opts";
+
+if ($inpIsDir eq 'y') {
+    readFileInto("$ARGV[0]/word/document.xml", $content);
+} else {
+    $content = `$unzip_cmd "$ARGV[0]" word/document.xml 2>$nullDevice`;
+}
+
+cleandie "Failed to extract required information from <$inputFileName>!\n" if ! $content;
+
+
+#
+# Be ready for outputting the extracted text contents.
+#
+
+if (@ARGV == 1) {
+     $ARGV[1] = $ARGV[0];
+
+     # Remove any trailing slashes to generate proper output filename, when
+     # input is directory.
+     $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');
+
+     $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
+}
+
+my $txtfile;
+open($txtfile, "> $ARGV[1]") || cleandie "Can't create <$ARGV[1]> for output!\n";
+binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
+
+
+#
+# Gather information about header, footer, hyperlinks, images, footnotes etc.
+#
+
+if ($inpIsDir eq 'y') {
+    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
+} else {
+    $_ = `$unzip_cmd "$ARGV[0]" word/_rels/document.xml.rels 2>$nullDevice`;
+}
+
+my %docurels;
+while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
+{
+    $docurels{"$2:$1"} = $3;
+}
+
+#
+# Gather list numbering information.
+#
+
+$_ = "";
+if ($inpIsDir eq 'y') {
+    readOptionalFileInto("$ARGV[0]/word/numbering.xml", $_);
+} else {
+    $_ = `$unzip_cmd "$ARGV[0]" word/numbering.xml 2>$nullDevice`;
+}
+
+my %abstractNum;
+my @N2ANId = ();
+
+my %NFList = (
+    "bullet"      => \&bullet,
+    "decimal"     => \&decimal,
+    "lowerLetter" => \&lowerLetter,
+    "upperLetter" => \&upperLetter,
+    "lowerRoman"  => \&lowerRoman,
+    "upperRoman"  => \&upperRoman
+);
+
+if ($_) {
+    while (/<w:abstractNum w:abstractNumId="(\d+)">(.*?)<\/w:abstractNum>/g)
+    {
+        my $abstractNumId = $1, $temp = $2;
+
+	while ($temp =~ /<w:lvl w:ilvl="(\d+)"[^>]*><w:start w:val="(\d+)"[^>]*><w:numFmt w:val="(.*?)"[^>]*>.*?<w:lvlText w:val="(.*?)"[^>]*>.*?<w:ind w:left="(\d+)" w:hanging="(\d+)"[^>]*>/g )
+        {
+            # $2: Start $3: NumFmt, $4: LvlText, ($5,$6): (Indent (twips), hanging)
+
+            @{$abstractNum{"$abstractNumId:$1"}} = (
+                $NFList{$3},
+                $4,
+                $2,
+                int ((($5-$6) / $config_twipsPerChar) + 0.5),
+                $5
+            );
+        }
+    }
+
+    while ( /<w:num w:numId="(\d+)"><w:abstractNumId w:val="(\d+)"/g )
+    {
+        $N2ANId[$1] = $2;
+    }
+}
+
+# Remove the temporary file (if) created to store input from STDIN. All the
+# (needed) data is read from it already.
+unlink("$tempFile") if -e "$tempFile";
+
+
+#
+# Subroutines for center and right justification of text in a line.
+#
+
+sub justify {
+    my $len = length $_[1];
+
+    if ($_[0] eq "center" && $len < ($config_lineWidth - 1)) {
+        return ' ' x (($config_lineWidth - $len) / 2) . $_[1];
+    } elsif ($_[0] eq "right" && $len < $config_lineWidth) {
+        return ' ' x ($config_lineWidth - $len) . $_[1];
+    } else {
+        return $_[1];
+    }
+}
+
+#
+# Subroutines for dealing with embedded links and images
+#
+
+sub hyperlink {
+    my $hlrid = $_[0];
+    my $hltext = $_[1];
+    my $hlink = $docurels{"hyperlink:$hlrid"};
+
+    $hltext =~ s/<[^>]*?>//og;
+    $hltext .= " [HYPERLINK: $hlink]" if (lc $config_showHyperLink eq "y" && $hltext ne $hlink);
+
+    return $hltext;
+}
+
+#
+# Subroutines for processing numbering information.
+#
+
+my @RomanNumbers = ( "",
+    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii",
+    "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii",
+    "xxiii", "xxiv", "xxv", "xxvi", "xxvii", "xxviii", "xxix", "xxx", "xxxi",
+    "xxxii", "xxxiii", "xxxiv", "xxxv", "xxxvi", "xxxvii", "xxxviii", "xxxix",
+    "xl", "xli", "xlii", "xliii", "xliv", "xlv", "xlvi", "xlvii", "xlviii",
+    "xlix", "l", "li" );
+
+
+sub lowerRoman {
+    return $RomanNumbers[$_[0]] if ($_[0] < @RomanNumbers);
+
+    @rcode = ("i", "iv", "v", "ix", "x", "xl", "l", "xc", "c", "cd", "d", "cm", "m");
+    @dval = (1, 4, 5, 9, 10, 40, 50, 90, 100, 400, 500, 900, 1000);
+
+    my $roman = "";
+    my $num = $_[0];
+
+    my $div, $i = (@rcode - 1);
+    while ($num > 0) {
+        $i-- while ($num < $dval[$i]);
+        $div = $num / $dval[$i];
+        $num = $num % $dval[$i];
+        $roman .= $rcode[$i] x $div;
+    }
+
+    return $roman;
+}
+
+sub upperRoman {
+    return uc lowerRoman(@_);
+}
+
+
+sub lowerLetter {
+    @Alphabets = split '' , "abcdefghijklmnopqrstuvwxyz";
+    return $Alphabets[($_[0] % 26) - 1] x (($_[0] - 1)/26 + 1);
+}
+
+sub upperLetter {
+    return uc lowerLetter(@_);
+}
+
+
+sub decimal {
+    return $_[0];
+}
+
+
+my %bullets = (
+    "\x6F" => 'o',
+    "\xEF\x81\xB6" => '::',	# Diamond
+    "\xEF\x82\xA7" => '#',	# Small Black Square
+    "\xEF\x82\xB7" => '*',	# Small Black Circle
+    "\xEF\x83\x98" => '>',	# Arrowhead
+    "\xEF\x83\xBC" => '+'	# Right Sign
+);
+
+sub bullet {
+    return $bullets{$_[0]} ? $bullets{$_[0]} : 'oo';
+}
+    
+my @lastCnt = (0);
+my @twipStack = (0);
+my @keyStack = (undef);
+my $ssiz = 1;
+
+sub listNumbering {
+    my $aref = \@{$abstractNum{"$N2ANId[$_[0]]:$_[1]"}};
+    my $lvlText;
+
+    if ($aref->[0] != \&bullet) {
+        my $key = "$N2ANId[$_[0]]:$_[1]";
+        my $ccnt;
+
+        if ($aref->[4] < $twipStack[$ssiz-1]) {
+            while ($twipStack[$ssiz-1] > $aref->[4]) {
+                pop @twipStack;
+                pop @keyStack;
+                pop @lastCnt;
+                $ssiz--;
+            }
+        }
+
+        if ($aref->[4] == $twipStack[$ssiz-1]) {
+            if ($key eq $keyStack[$ssiz-1]) {
+                ++$lastCnt[$ssiz-1];
+            }
+            else {
+                $keyStack[$ssiz-1] = $key;
+                $lastCnt[$ssiz-1] = $aref->[2];
+            }
+        }
+        elsif ($aref->[4] > $twipStack[$ssiz-1]) {
+            push @twipStack, $aref->[4];
+            push @keyStack, $key;
+            push @lastCnt, $aref->[2];
+            $ssiz++;
+        }
+
+        $ccnt = $lastCnt[$ssiz-1];
+
+        $lvlText = $aref->[1];
+        $lvlText =~ s/%\d([^%]*)$/($aref->[0]->($ccnt)).$1/oe;
+
+        my $i = $ssiz - 2;
+        $i-- while ($lvlText =~ s/%\d([^%]*)$/$lastCnt[$i]$1/o);
+    }
+    else {
+        $lvlText = $aref->[0]->($aref->[1]);
+    }
+
+    return ' ' x $aref->[3] . $lvlText . ' ';
+}
+
+#
+# Subroutines for processing paragraph content.
+#
+
+sub processParagraph {
+    my $para = $_[0] . "$config_newLine";
+    my $align = $1 if ($_[0] =~ /<w:jc w:val="([^"]*?)"\/>/);
+
+    $para =~ s/<.*?>//og;
+    return justify($align,$para) if $align;
+
+    return $para;
+}
+
+#
+# Text extraction starts.
+#
+
+my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");
+
+$content =~ s/<?xml .*?\?>(\r)?\n//;
+
+$content =~ s{<(wp14|wp):[^>]*>.*?</\1:[^>]*>}||og;
+
+# Remove the field instructions (instrText) and data (fldData), and deleted
+# text.
+$content =~ s{<w:(instrText|fldData|delText)[^>]*>.*?</w:\1>}||ogs;
+
+# Mark cross-reference superscripting within [...].
+$content =~ s|<w:vertAlign w:val="superscript"/></w:rPr><w:t>(.*?)</w:t>|[$1]|og;
+
+$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;
+
+my $hr = '-' x $config_lineWidth . $config_newLine;
+$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;
+
+$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;
+
+$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;
+
+$content =~ s|<w:numPr><w:ilvl w:val="(\d+)"/><w:numId w:val="(\d+)"\/>|listNumbering($2,$1)|oge;
+
+$content =~ s{<w:ind w:(left|firstLine)="(\d+)"( w:hanging="(\d+)")?[^>]*>}|' ' x int((($2-$4)/$config_twipsPerChar)+0.5)|oge;
+
+$content =~ s{<w:p [^/>]+?/>|<w:br/>}|$config_newLine|og;
+
+$content =~ s/<w:p[^>]+?>(.*?)<\/w:p>/processParagraph($1)/ogse;
+
+$content =~ s/<.*?>//og;
+
+
+#
+# Convert non-ASCII characters/character sequences to ASCII characters.
+#
+
+$content =~ s/(\xC2|\xC3|\xCF|\xE2.|\xEF.)(.)/($splchars{$1}{$2} ? $splchars{$1}{$2} : $1.$2)/oge;
+
+#
+# Convert docx specific (reserved HTML/XHTML) escape characters.
+#
+$content =~ s/(&)(amp|apos|gt|lt|quot)(;)/$escChrs{lc $2}/iog;
+
+#
+# Write the extracted and converted text contents to output.
+#
+
+print $txtfile $content;
+close $txtfile;
+