#!/usr/bin/perl -w -CSDA
#
# snJwordcount - article word density counter for Joomla! 1.5
#
# Copyright (c) 2008-2009 EPIPE Communications
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# See <http://www.gnu.org/licenses/> for license terms.
#
#
# Version: 1.2
#
# Version history:
#
# - 1.2: changed the license to GPLv3 due to stupid new
#        JED requirements (2009-01-29)
#
# - 1.1: changed to use CMS::Joomla module (2008-11-13)
#
# - 1.0: initial release (2008-11-07)
#
# Download location:
#
#    http://dist.epipe.com/joomla/perl/
#
# Author contact information:
#
#    info # epipe.com
#    http://epipe.com/
#
# Description:
#
# snJwordcount connects to a Joomla! 1.5 database and extracts words
# from specified or all articles in the database. It outputs a list
# of all the words and a count of each word.
#
# Requirements:
#
# - perl
#
# - shell access to the Joomla! server
#
# - CMS::Joomla perl module (available from CPAN)
#
# - some common perl modules (available from CPAN) for database
#   access and HTML manipulation, see the "use" statements at the
#   beginning of the code
#
# Configuration:
#
# Have a look at the configuration variables at the beginning of the code
# and edit them according to your needs. By default (as distributed
# originally) the variables are set to sane defaults.
#
# Usage:
#
#    snJwordcount /your/joomla/dir/configuration.php [articleID ...]
#
# If articleID consists entirely of digits, it is assumed to be
# an article ID number of an existing article in the Joomla!
# database.
#
# Otherwise articleID is expected to be a title or an alias title
# of the desired article.
#
# Several articleIDs can be listed. If none are listed, word count
# of all published articles in the Joomla! database is returned.
#
# Limitations:
#
# This script only counts words in articles in the Joomla! database.
# The actual web site word counts can be very different when looking
# at the site through the Joomla! front-end.
#
# It is not known how well this script works with Unicode UTF-8
# characters (which I suppose is the Joomla! database format)
# outside of the standard US-ASCII subset used by English language
# websites. The author hopes that the perl -CSDA options at the
# first line make this script automagically fully UTF-8 compliant.
# That is probably not the case.
#

use strict;
use warnings;
use DBI;
use CMS::Joomla;
use HTML::Strip;
use HTML::Entities;

# configurables:

my $stripjoomla  = 1;    # whether to strip {joomla} tags
my $emitspaces   = 1;    # whether to emit spaces in place of HTML tags
my $counttitle   = 1;    # whether to include words in article title
my $minwordlen   = 3;    # minimum word length in characters
my $printpercent = 1;    # whether to output percentage of total count

# end of configuration variables

# Accumulators shared by addwordcount() and wordcount():
my %words;               # lower-cased word => number of occurrences
my $totalwords = 0;      # number of counted words (after length filter)

#
# striptags($text)
#
# Converts an HTML fragment into plain text: optionally replaces
# {joomla}-style tags with a space ($stripjoomla), strips HTML markup
# (emitting spaces in place of tags when $emitspaces is set), decodes
# HTML entities, collapses runs of whitespace into single spaces and
# trims both ends.
#
# Returns the plain text, or undef when $text is undef.
#
sub striptags {
    my ($text) = @_;

    return unless defined($text);

    # The opening brace is escaped on purpose: an unescaped literal "{"
    # in a pattern is deprecated (and in some positions fatal) in newer
    # perls.
    $text =~ s/\{[^}\s]+\}/ /g if $stripjoomla;

    my $hs = HTML::Strip->new();
    $hs->set_emit_spaces($emitspaces);
    my $st = $hs->parse($text);
    $hs->eof;
    $st = '' unless defined($st);

    decode_entities($st);    # decodes in place when called in void context

    $st =~ s/\s+/ /g;
    $st =~ s/^\s+//;
    $st =~ s/\s+$//;

    return $st;
}

#
# addwordcount($title, $text)
#
# Splits the article text (HTML, cleaned with striptags()) and, when
# $counttitle is set, the article title into words and adds them to
# the global %words / $totalwords counters. Words shorter than
# $minwordlen characters are ignored; words are counted lower-cased.
#
# Undefined (database NULL) title or text is treated as empty rather
# than triggering "uninitialized value" warnings.
#
sub addwordcount {
    my ($title, $html) = @_;

    my @parts;
    push @parts, $title if $counttitle && defined($title);

    my $plain = striptags($html);
    push @parts, $plain if defined($plain);

    foreach my $word (split(/\W+/, join(' ', @parts))) {
        next if length($word) < $minwordlen;
        $totalwords++;
        $words{ lc $word }++;
    }

    return;
}

#
# wordcount($configfile, @ids)
#
# Opens the Joomla! installation described by $configfile, feeds the
# selected articles to addwordcount() and prints the resulting word
# list (most frequent first, ties in alphabetical order) followed by
# the total word count.
#
# Each element of @ids is either an all-digit article id or an article
# title / alias. With no @ids, every article with state > 0 is counted.
# An id that matches no article produces a warning on STDERR.
#
# Returns 1 on success and 0 when the Joomla! configuration could not
# be loaded or no database handle was obtained. Database errors raise
# an exception (RaiseError is enabled on the handle).
#
# NOTE(review): only the "introtext" column is read. If the content
# table keeps the part of an article after a "read more" break in a
# separate column, that text is not counted -- confirm against the
# schema.
#
sub wordcount {
    my ($configfile, @ids) = @_;

    my $joomla = CMS::Joomla->new($configfile);
    return 0 unless defined($joomla);

    my $dbh = $joomla->dbhandle( { RaiseError => 1, AutoCommit => 1 } );
    return 0 unless defined($dbh);

    my $select = 'SELECT id, title, introtext'
               . ' FROM ' . $joomla->cfg->{'dbprefix'} . 'content';

    if (@ids) {
        # One prepared statement per kind of lookup, created on first
        # use and reused for all following ids of the same kind.
        my %sth_for;

        foreach my $id (@ids) {
            # \A..\z and [0-9] so that neither a trailing newline nor
            # non-ASCII digits are mistaken for a numeric article id.
            my $numeric = $id =~ /\A[0-9]+\z/;
            my $kind    = $numeric ? 'id' : 'name';

            $sth_for{$kind} ||= $dbh->prepare($select . ' WHERE ' .
                ($numeric ? 'id = ?' : 'title = ? OR alias = ?'));

            my $sth = $sth_for{$kind};
            $sth->execute($numeric ? ($id) : ($id, $id));

            my $found = 0;
            while (my (undef, $title, $introtext) = $sth->fetchrow_array) {
                addwordcount($title, $introtext);
                $found++;
            }
            print STDERR "$0: no article found for '$id'\n" unless $found;
        }
    }
    else {
        my $sth = $dbh->prepare($select . ' WHERE state > 0');
        $sth->execute;

        while (my (undef, $title, $introtext) = $sth->fetchrow_array) {
            addwordcount($title, $introtext);
        }
    }

    # %words is non-empty only if $totalwords > 0, so the percentage
    # below can never divide by zero.
    foreach my $word (sort { $words{$b} <=> $words{$a} || $a cmp $b }
                      keys %words) {
        my $line = "$word: $words{$word}";
        $line .= sprintf(" (%2.1f%%)", $words{$word} / $totalwords * 100)
            if $printpercent;
        print "$line\n";
    }

    print "--\n";
    print "TOTAL $totalwords words\n";

    return 1;
}

# main()

if (@ARGV < 1) {
    print STDERR "usage: $0 /your/joomla/dir/configuration.php [articleID ...]\n";
    exit(2);
}

# Report a failed setup instead of exiting silently with status 0.
unless (wordcount(@ARGV)) {
    print STDERR "$0: unable to read Joomla! configuration '$ARGV[0]'"
               . " or to connect to its database\n";
    exit(1);
}

# eof