#!/usr/bin/perl -w -CSDA
#
# snJmetadesc - meta description generator for Joomla! 1.5
#
# Copyright (c) 2008-2009 EPIPE Communications
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# See <http://www.gnu.org/licenses/> for license terms.
#
#
# Version: 1.4
#
# Version history:
#
# - 1.4: changed the license to GPLv3 due to stupid new
# JED requirements (2009-01-14)
#
# - 1.3: changed to use CMS::Joomla module (2008-11-15)
# added metadescprefix configurable
#
# - 1.2: documentation improvements (2008-09-20)
#
# - 1.1: documentation improvements (2008-08-21)
#
# - 1.0: initial release (2008-08-19)
#
# Download location:
#
# http://dist.epipe.com/joomla/perl/
#
# Author contact information:
#
# info # epipe.com
# http://epipe.com/
#
# Description:
#
# snJmetadesc connects to a Joomla! 1.5 database and goes through all
# articles. It extracts the beginning of each article content, removes
# any markup and adds the resulting string as the meta description tag of
# that article if none exists previously or if the original meta
# description is also autogenerated by this script. When a meta
# description of an article is changed, the script outputs the article
# ID number followed by the new meta description. All articles are
# processed, no matter what their status is (published or not).
#
# The script uses a specific character sequence "... " (excluding the
# quotes but including the space) at the end of the meta description
# field to indicate that it is autogenerated by this script. You can
# also define your meta descriptions manually for some articles. In
# that case do NOT type a space character at the end of the field
# if you end it with three dots (in that case the next run of this
# script would overwrite your manually written meta description).
# You can easily customize this character sequence by editing the
# script if you wish.
#
# Most Joomla! developers would call this a Search Engine Optimization
# (SEO) tool, but this is really just an automatic meta description
# generator. It is intended to be run manually after updating any
# articles or periodically from cron(8).
#
# This script also serves as an example of how Joomla! configuration
# parameters and database can be accessed and manipulated from a perl
# script. Those folks who think that PHP sucks but still wish or are
# forced to use Joomla! might appreciate this as a basis for their
# own perl based Joomla! tools.
#
# Requirements:
#
# - perl (recent version)
#
# - shell access to the Joomla! server
#
# - CMS::Joomla perl module (available from CPAN)
#
# - some common perl modules (available from CPAN) for database
# access and HTML manipulation, see the "use" statements at the
# beginning of the code
#
# Configuration:
#
# Have a look at the configuration variables at the beginning of the code
# and edit them according to your needs. By default (as distributed
# originally) this script is set to "dry run" mode (in other words
# it does not do anything but it outputs the meta descriptions that
# would have been created).
#
# If you wish to remove all autogenerated meta descriptions, just
# set maxmetadesclen to zero.
#
# Usage:
#
# snJmetadesc /your/joomla/dir/configuration.php
#
# You can also list several configuration.php files. In that case
# all of the corresponding web sites will be processed.
#
# Limitations:
#
# This script creates meta descriptions only for articles in the Joomla!
# database. Those meta descriptions are displayed only on single article
# pages. Other types of pages will be still generated by Joomla! using
# the site global meta description (which Google does not like).
# If there is no global definition, then the meta description on those
# pages is left empty.
#
# It is not known how well this script works with Unicode UTF-8
# characters (which I suppose is the Joomla! database format)
# outside of the standard US-ASCII subset used by English language
# websites. The author hopes that the perl -CSDA options at the
# first line make this script automagically fully UTF-8 compliant.
# That is probably not the case.
#
# Safety Precautions:
#
# You should probably have basic perl, unix and database administration
# skills (as well as the ability to read English language) before
# running this script. You might want to take a backup of your
# Joomla! database or go to the emergency shelters or whatever your
# preferred safety precautions are.
#
use strict;
use DBI;
use HTML::Strip;
use HTML::Entities;
use CMS::Joomla;
# configurables:

# Maximum length of the extracted description text.
my $maxmetadesclen = 250;

# Minimum length; shorter extracts produce no meta description.
my $minmetadesclen = 32;

# Whether to strip {joomla} tags.
my $stripjoomla    = 1;

# Whether to do (really simple) e-mail cloaking.
my $cloackemail    = 1;

# Whether to emit spaces in place of HTML tags.
my $emitspaces     = 1;

# If set, the database is not actually updated.
my $dryrun         = 1;

# If set, the generated meta descriptions are printed.
my $verbose        = 1;

# Prefix string for generated meta descriptions.
my $metadescprefix = '';

# end of configuration variables
# makemetadesc - turn article markup into a plain-text meta description
#
# Argument:
#   $text - article content (markup); may be undef
#
# Returns:
#   the entity-encoded description text, or undef when $text is undef
#   or the cleaned-up text is shorter than $minmetadesclen.
#
# Processing steps, in order:
#   1. replace {...} tokens (no whitespace inside) with a space, if
#      $stripjoomla is set
#   2. strip HTML with HTML::Strip and decode HTML entities
#   3. remove every whitespace-delimited token containing "@", if
#      $cloackemail is set
#   4. collapse whitespace runs to single spaces and trim both ends
#   5. cut to at most $maxmetadesclen characters without leaving a
#      partial word at the end
sub makemetadesc ($) {
    my $text = shift;
    return undef unless defined($text);

    # The braces are escaped so the pattern stays valid on perl versions
    # that complain about an unescaped literal "{" in a regex.
    $text =~ s/\{[^}\s]+\}/ /g if $stripjoomla;

    my $hs = HTML::Strip->new();
    $hs->set_emit_spaces($emitspaces);
    my $desc = $hs->parse($text);
    $hs->eof;

    decode_entities($desc);
    $desc =~ s/\S+@\S+//g if $cloackemail;
    $desc =~ s/\s+/ /g;
    $desc =~ s/^\s+//;

    if (length($desc) > $maxmetadesclen) {
        # Only drop the trailing word when the cut really falls inside
        # it, i.e. the first character beyond the limit is still a word
        # character.  A cut that lands exactly on a word boundary keeps
        # the complete last word.
        my $cut_inside_word = substr($desc, $maxmetadesclen, 1) =~ /\w/;
        $desc = substr($desc, 0, $maxmetadesclen);
        $desc =~ s/\w+$// if $cut_inside_word;
    }
    $desc =~ s/\s+$//;

    # Too little text left to be a useful description.
    return undef if length($desc) < $minmetadesclen;
    return encode_entities($desc);
}
#
# gendescs - (re)generate the meta descriptions of one Joomla! site
#
# Argument:
#   path of the site's configuration.php, passed to CMS::Joomla->new
#
# Returns:
#   1 on success, 0 when CMS::Joomla->new or its dbhandle method
#   returned undef.
#
# Dies:
#   on any database error (the handle is opened with RaiseError).  The
#   open transaction is rolled back and the handle disconnected before
#   the exception is re-thrown.
#
# Every row of the <dbprefix>content table is examined.  Rows whose
# metadesc is non-blank and does not end in the "... " marker are left
# alone.  For the others a new description is built from introtext with
# makemetadesc(); rows whose value would not change are skipped.  The
# changes are collected first and then applied in ascending id order;
# each one is printed if $verbose is set and written unless $dryrun is
# set.
sub gendescs ($) {
    my $joomla = CMS::Joomla->new(shift);
    return 0 unless defined($joomla);
    my $dbh = $joomla->dbhandle( { RaiseError => 1, AutoCommit => 0 } );
    return 0 unless defined($dbh);

    my $table = $joomla->cfg->{'dbprefix'} . 'content';

    my $ok = eval {
        my $select = $dbh->prepare(
            "SELECT id, introtext, metadesc FROM $table");
        $select->execute;

        # Collect all changes before writing anything, so the table is
        # not modified while it is still being read.
        my %updates;
        while (my ($id, $introtext, $metadesc) = $select->fetchrow_array) {
            # A non-blank description without the trailing "... " marker
            # was written by hand: never touch it.
            next if defined($metadesc)
                && $metadesc =~ /\S/
                && $metadesc !~ /\.\.\. $/;

            my $generated = makemetadesc($introtext);
            next if !defined($generated) && !defined($metadesc);

            # No usable text: clear the old autogenerated description.
            my $newmetadesc = defined($generated)
                ? $metadescprefix . $generated . '... '
                : '';

            next if defined($metadesc) && $newmetadesc eq $metadesc;
            $updates{$id} = $newmetadesc;
        }

        my $update = $dbh->prepare(
            "UPDATE $table SET metadesc = ? WHERE id = ?");

        # Ascending id order keeps the output stable between runs.
        foreach my $id (sort { $a <=> $b } keys %updates) {
            my $data = $updates{$id};
            print "$id: $data\n" if $verbose;
            $update->execute($data, $id) unless $dryrun;
        }

        $dbh->commit;
        1;
    };

    unless ($ok) {
        # Save the error first: the cleanup evals below reset $@.
        my $err = $@ || "unknown database error\n";
        eval { $dbh->rollback };
        eval { $dbh->disconnect };
        die $err;
    }

    $dbh->disconnect;
    return 1;
}
# main()
#
# Run gendescs() for every configuration.php named on the command line.
# A site that fails (gendescs() returns false or dies) is reported on
# STDERR and the remaining sites are still processed.  The exit status
# is 0 when every site succeeded, 1 when at least one failed and 2 when
# no configuration file was given.
unless (@ARGV) {
    print STDERR "usage: $0 /your/joomla/dir/configuration.php ...\n";
    exit 2;
}

my $failed_sites = 0;
foreach my $cfgfile (@ARGV) {
    # eval keeps a database error in one site from aborting the rest.
    my $ok = eval { gendescs($cfgfile) };
    next if $ok;

    # gendescs() returns 0 without dying when the CMS::Joomla object or
    # its database handle could not be obtained.
    my $reason = $@
        ? $@
        : 'could not read configuration or connect to database';
    chomp $reason;
    warn "$0: $cfgfile: $reason\n";
    $failed_sites++;
}
exit($failed_sites ? 1 : 0);
# eof