#!/usr/bin/perl -w -CSDA
#
# snJwordcount - article word density counter for Joomla! 1.5
#
# Copyright (c) 2008-2009 EPIPE Communications
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# See <http://www.gnu.org/licenses/> for license terms.
#
#
# Version: 1.2
#
# Version history:
#
# - 1.2: changed the license to GPLv3 due to stupid new
# JED requirements (2009-01-29)
#
# - 1.1: changed to use CMS::Joomla module (2008-11-13)
#
# - 1.0: initial release (2008-11-07)
#
# Download location:
#
# http://dist.epipe.com/joomla/perl/
#
# Author contact information:
#
# info # epipe.com
# http://epipe.com/
#
# Description:
#
# snJwordcount connects to a Joomla! 1.5 database and extracts the words
# from the specified articles, or from all published articles if none
# are specified. It outputs a list of all the words and a count of each
# word.
#
# Requirements:
#
# - perl
#
# - shell access to the Joomla! server
#
# - CMS::Joomla perl module (available from CPAN)
#
# - some common perl modules (available from CPAN) for database
# access and HTML manipulation, see the "use" statements at the
# beginning of the code
#
# Configuration:
#
# Have a look at the configuration variables at the beginning of the code
# and edit them according to your needs. By default (as distributed
# originally) the variables are set to sane defaults.
#
# Usage:
#
# snJwordcount /your/joomla/dir/configuration.php [articleID ...]
#
# If articleID consists entirely of numbers, it is assumed to be
# an article ID number of an existing article in the Joomla!
# database.
#
# Otherwise articleID is expected to be a title or an alias title
# of the desired article.
#
# Several articleIDs can be listed. If none are listed, word count
# of all published articles in the Joomla! database is returned.
#
# Limitations:
#
# This script only counts words in articles in the Joomla! database.
# The actual web site word counts can be very different when looking
# at the site through the Joomla! front-end.
#
# It is not known how well this script works with Unicode UTF-8
# characters (which I suppose is the Joomla! database format)
# outside of the standard US-ASCII subset used by English language
# websites. The author hopes that the perl -CSDA options at the
# first line make this script automagically fully UTF-8 compliant.
# That is probably not the case.
#
use strict;
use DBI;
use CMS::Joomla;
use HTML::Strip;
use HTML::Entities;
# configurables:
my $stripjoomla = 1; # whether to replace brace-delimited {joomla} tags with a space before counting
my $emitspaces = 1; # whether to emit spaces in place of HTML tags (handed to HTML::Strip)
my $counttitle = 1; # whether to include words in article title
my $minwordlen = 3; # minimum word length in characters; shorter words are not counted
my $printpercent = 1; # whether to output percentage of total count
# end of configuration variables
# running tallies, filled in by addwordcount() and printed by wordcount():
my %words; # lowercased word => number of occurrences
my $totalwords = 0; # number of words counted so far (only words of at least $minwordlen characters)
# striptags($html)
#
# Reduce an HTML fragment to a single line of plain text.
#
# Steps, in order:
#   1. if $stripjoomla is set, every brace-delimited marker that contains
#      neither whitespace nor a closing brace (e.g. "{tag}") becomes a space;
#   2. HTML markup is removed with HTML::Strip, honouring $emitspaces;
#   3. HTML entities are decoded in place;
#   4. runs of whitespace are collapsed to one space and the result is
#      trimmed at both ends.
#
# Returns the cleaned text, or undef when the argument is undef.
sub striptags ($) {
    my ($html) = @_;

    if (!defined $html) {
        return undef;
    }

    if ($stripjoomla) {
        $html =~ s/{[^}\s]+}/ /g;
    }

    my $stripper = HTML::Strip->new();
    $stripper->set_emit_spaces($emitspaces);
    my $plain = $stripper->parse($html);
    $stripper->eof;

    # Called in void context so that $plain is decoded in place.
    decode_entities($plain);

    # Normalise whitespace: collapse first, then trim both ends.
    for ($plain) {
        s/\s+/ /g;
        s/^\s+//;
        s/\s+$//;
    }

    return $plain;
}
#
# addwordcount($title, $html)
#
# Tally the words of one article into the file-level %words hash and the
# $totalwords counter.
#
#   $title - article title; counted as-is (no HTML stripping) and only
#            when $counttitle is set
#   $html  - article body; passed through striptags() before counting
#
# Words are the pieces left after splitting on runs of non-word characters.
# Pieces shorter than $minwordlen characters are skipped and do not count
# towards $totalwords. Words are lowercased before being used as hash keys.
#
# Returns nothing.
sub addwordcount ($$) {
    my ($title, $html) = @_;

    # A NULL database column arrives as undef, and striptags() passes undef
    # straight through. Treat both as empty text rather than triggering
    # "uninitialized value" warnings under -w.
    my $body = striptags($html);
    $body  = '' unless defined $body;
    $title = '' unless defined $title;

    my $text = ($counttitle ? $title . ' ' : '') . $body;

    foreach my $word (split /\W+/, $text) {
        # This also drops the empty leading field that split produces when
        # the text starts with a non-word character.
        next if length($word) < $minwordlen;

        $totalwords++;
        $words{ lc $word }++;
    }

    return;
}
#
# _tally_rows($sth)
#
# Feed every row of an already executed statement handle to addwordcount().
# Each row is expected to be (id, title, introtext, fulltext). The intro and
# the full text are joined with a space so that the whole article is counted;
# NULL columns are left out. Returns the number of rows read.
sub _tally_rows {
    my ($sth) = @_;
    my $rows = 0;

    while (my (undef, $title, $introtext, $fulltext) = $sth->fetchrow_array) {
        $rows++;
        my $html = join ' ', grep { defined $_ } ($introtext, $fulltext);
        addwordcount($title, $html);
    }

    return $rows;
}

# _print_report()
#
# Print one "word: count" line per entry of %words to STDOUT, most frequent
# first, followed by a "--" separator and the total. When $printpercent is
# set, each line also carries the word's share of $totalwords.
sub _print_report {
    # Ties are broken alphabetically so that the output is the same on every
    # run (hash key order alone is not stable).
    my @ordered = sort { $words{$b} <=> $words{$a} or $a cmp $b } keys %words;

    foreach my $word (@ordered) {
        my $line = "$word: $words{$word}";
        # %words is only non-empty when $totalwords > 0, so no division by zero.
        $line .= sprintf(" (%2.1f%%)", $words{$word} / $totalwords * 100)
            if $printpercent;
        print "$line\n";
    }

    print "--\n";
    print "TOTAL $totalwords words\n";
    return;
}

# wordcount($configfile, @ids)
#
# Count the words of articles in the Joomla! database described by
# $configfile and print the report to STDOUT.
#
#   $configfile - path to the Joomla! configuration.php
#   @ids        - optional article selectors. A selector made up entirely of
#                 ASCII digits is matched against the article id; any other
#                 selector is matched against title or alias. With no
#                 selectors, every article with state > 0 is counted.
#
# Returns 1 on success. Returns 0, after a message on STDERR, when the
# configuration cannot be loaded or no database handle is obtained.
# The handle is opened with RaiseError, so DBI errors die.
sub wordcount (@) {
    my ($cfgfile, @ids) = @_;

    my $joomla = CMS::Joomla->new($cfgfile);
    unless (defined $joomla) {
        my $shown = defined $cfgfile ? $cfgfile : '(none)';
        warn "$0: cannot load Joomla! configuration from '$shown'\n";
        return 0;
    }

    my $dbh = $joomla->dbhandle( { RaiseError => 1, AutoCommit => 1 } );
    unless (defined $dbh) {
        warn "$0: cannot connect to the Joomla! database\n";
        return 0;
    }

    # Joomla! stores the part of an article after the "Read more" break in
    # the fulltext column, so both columns are needed to count the whole
    # article. FULLTEXT is a reserved word in MySQL, hence the backticks.
    my $select = 'SELECT id, title, introtext, `fulltext`'
        . ' FROM ' . $joomla->cfg->{'dbprefix'} . 'content';

    if (@ids) {
        # Each lookup statement is prepared at most once and then reused.
        my ($by_id, $by_name);

        foreach my $id (@ids) {
            my $found;

            # \A...\z and [0-9] reject a trailing newline and non-ASCII digits.
            if ($id =~ /\A[0-9]+\z/) {
                $by_id ||= $dbh->prepare($select . ' WHERE id = ?');
                $by_id->execute($id);
                $found = _tally_rows($by_id);
            } else {
                $by_name ||= $dbh->prepare($select
                    . ' WHERE title = ? OR alias = ?');
                $by_name->execute($id, $id);
                $found = _tally_rows($by_name);
            }

            warn "$0: no article matches '$id'\n" unless $found;
        }
    } else {
        my $sth = $dbh->prepare($select . ' WHERE state > 0');
        $sth->execute;
        _tally_rows($sth);
    }

    _print_report();
    return 1;
}
# main()
#
# Usage: snJwordcount /your/joomla/dir/configuration.php [articleID ...]
#
# Exit status: 2 when the configuration file argument is missing, 1 when
# wordcount() reports failure (it returns 0 if the configuration cannot be
# loaded or no database handle is available), 0 otherwise.
if (@ARGV < 1) {
    print STDERR "usage: $0 /your/joomla/dir/configuration.php [articleID ...]\n";
    exit(2);
}

# Propagate failure to the shell instead of always exiting with status 0.
wordcount(@ARGV) or exit(1);
# eof