refactor: sitemap throttle + SEF URLs (#100) and batch cursor pagination (#106) #116

Merged
jmiller merged 1 commit from feat/sitemap-batch-redesign into dev 2026-06-29 17:18:12 +00:00
4 changed files with 85 additions and 16 deletions
@@ -73,7 +73,9 @@ class BatchController extends BaseController
}
$app = Factory::getApplication();
$limit = min($app->getInput()->getInt('limit', 50), 200);
$input = $app->getInput();
$limit = min($input->getInt('limit', 50), 200);
$lastId = max(0, $input->getInt('lastid', 0));
$db = Factory::getContainer()->get(\Joomla\Database\DatabaseInterface::class);
$query = $db->getQuery(true)
@@ -88,18 +90,25 @@ class BatchController extends BaseController
)
->where($db->quoteName('c.state') . ' = 1')
->where($db->quoteName('t.id') . ' IS NULL')
->where($db->quoteName('c.id') . ' > ' . $lastId)
->order($db->quoteName('c.id') . ' ASC');
// Always offset=0: processed articles now have #__mokoog_tags rows
// and are excluded by the LEFT JOIN ... IS NULL filter automatically.
// Cursor-based pagination by id: each chunk fetches the next articles whose
// id is greater than the previous chunk's highest id. A row that fails to
// insert is passed over on the next chunk (its id is already behind the
// cursor) instead of being re-fetched forever, so the batch always reaches
// the end. The client stops when a chunk examines 0 rows.
$db->setQuery($query, 0, $limit);
$articles = $db->loadObjectList();
$created = 0;
$skipped = 0;
$now = Factory::getDate()->toSql();
$created = 0;
$skipped = 0;
$lastProcessedId = $lastId;
$now = Factory::getDate()->toSql();
foreach ($articles as $article) {
$lastProcessedId = (int) $article->id;
$ogTitle = $article->title;
$ogDescription = $this->extractDescription($article);
$ogImage = $this->extractImage($article);
@@ -131,7 +140,10 @@ class BatchController extends BaseController
}
echo new JsonResponse([
'created' => $created,
'created' => $created,
'skipped' => $skipped,
'examined' => \count($articles),
'last_id' => $lastProcessedId,
]);
$app->close();
@@ -234,27 +234,31 @@ document.addEventListener('DOMContentLoaded', function() {
return;
}
status.textContent = total + ' <?php echo Text::_('COM_MOKOOG_BATCH_FOUND', true); ?>';
processChunk(0, total, chunkSize, token, bar, status);
processChunk(0, 0, total, chunkSize, token, bar, status);
})
.catch(function(err) {
status.textContent = '<?php echo Text::_('COM_MOKOOG_BATCH_ERROR', true); ?> ' + err.message;
});
}
function processChunk(processed, total, chunkSize, token, bar, status) {
// Always offset=0: processed items are excluded by the IS NULL filter
fetch('index.php?option=com_mokoog&task=batch.process&format=json&limit=' + chunkSize + '&' + token + '=1')
function processChunk(lastId, processed, total, chunkSize, token, bar, status) {
// Cursor-based: pass the highest id seen so far. Failed rows fall behind
// the cursor and are not re-fetched, so the loop always terminates.
fetch('index.php?option=com_mokoog&task=batch.process&format=json&limit=' + chunkSize + '&lastid=' + lastId + '&' + token + '=1')
.then(function(r) { return r.json(); })
.then(function(resp) {
processed += resp.data.created;
var pct = Math.min(100, Math.round((processed / total) * 100));
var examined = resp.data.examined || 0;
processed += examined;
var pct = total > 0 ? Math.min(100, Math.round((processed / total) * 100)) : 100;
bar.style.width = pct + '%';
bar.textContent = pct + '%';
status.textContent = processed + ' / ' + total + ' <?php echo Text::_('COM_MOKOOG_BATCH_PROCESSED', true); ?>';
if (resp.data.created > 0 && processed < total) {
processChunk(processed, total, chunkSize, token, bar, status);
if (examined > 0) {
processChunk(resp.data.last_id, processed, total, chunkSize, token, bar, status);
} else {
bar.style.width = '100%';
bar.textContent = '100%';
bar.classList.remove('progress-bar-animated');
bar.classList.add('bg-success');
status.textContent = '<?php echo Text::_('COM_MOKOOG_BATCH_COMPLETE', true); ?> ' + processed + ' articles.';
@@ -28,6 +28,11 @@ final class MokoOG extends CMSPlugin implements SubscriberInterface
*/
protected $autoloadLanguage = true;
/**
 * Minimum seconds between full sitemap regenerations (save-time throttle).
 *
 * The save-time handler compares this value against the age of the existing
 * sitemap.xml in the site root (current time minus the file's modification
 * time) and skips regeneration while the file is younger than this interval.
 */
private const SITEMAP_MIN_INTERVAL = 60;
/**
* Returns the events this plugin subscribes to.
*
@@ -845,6 +850,15 @@ final class MokoOG extends CMSPlugin implements SubscriberInterface
return;
}
// Throttle: rebuilding the whole sitemap on every save does not scale
// (bulk edits/imports). Regenerate at most once per interval — the
// sitemap is eventually consistent within that window.
$path = JPATH_ROOT . '/sitemap.xml';
if (is_file($path) && (time() - filemtime($path)) < self::SITEMAP_MIN_INTERVAL) {
return;
}
$changefreq = $this->params->get('sitemap_changefreq', 'weekly');
$xml = SitemapBuilder::generate($changefreq);
@@ -81,7 +81,7 @@ class SitemapBuilder
continue;
}
$url = $root . '/index.php?option=com_content&view=article&id=' . $article->id;
$url = self::articleUrl($article, $root);
$lastmod = $article->modified && $article->modified !== '0000-00-00 00:00:00'
? date('Y-m-d', strtotime($article->modified)) : '';
@@ -102,6 +102,45 @@ class SitemapBuilder
return $xml;
}
/**
 * Build the SEF/canonical site URL for an article, with a safe fallback.
 *
 * Routes through the site router so the sitemap matches the canonical URLs
 * the plugin emits. On multilingual sites the article's own content language
 * is passed to the router (as `lang=`), so the URL carries that language's
 * SEF prefix rather than the default language's. If routing fails (or SEF is
 * off), falls back to the non-SEF index.php URL — never an empty or broken URL.
 *
 * @param object $article Row with id, alias, catid, language
 * @param string $root    Site root without trailing slash
 *
 * @return string Absolute URL
 */
private static function articleUrl(object $article, string $root): string
{
    $id       = (int) $article->id;
    $fallback = $root . '/index.php?option=com_content&view=article&id=' . $id;

    try {
        // Same internal link shape com_content uses: "id:alias", then catid.
        $internal = 'index.php?option=com_content&view=article&id=' . $id
            . (!empty($article->alias) ? ':' . $article->alias : '')
            . (!empty($article->catid) ? '&catid=' . (int) $article->catid : '');

        // Only language-specific articles on a multilingual site get a lang
        // segment; "*" means "all languages" and must not be routed as one.
        // The multilanguage check stays inside the try so a failure there
        // also degrades to the fallback URL.
        $language = (string) ($article->language ?? '');

        if ($language !== '' && $language !== '*' && \Joomla\CMS\Language\Multilanguage::isEnabled()) {
            $internal .= '&lang=' . $language;
        }

        $routed = \Joomla\CMS\Router\Route::link(
            'site',
            $internal,
            false,
            \Joomla\CMS\Router\Route::TLS_IGNORE,
            true
        );

        if (\is_string($routed) && $routed !== '') {
            return $routed;
        }
    } catch (\Throwable $e) {
        // Deliberate best-effort: any routing failure falls through to the
        // non-SEF URL below so the sitemap entry is still valid.
    }

    return $fallback;
}
/**
* Write sitemap XML to the site root.
*