/**
* Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
* chunks of tokens (one chunk per top-level block matched) and eventually an
* end event. Tokens map to HTML tags as far as possible, with custom tokens
* used where further processing on the token stream is needed.
*/
{
use Wikimedia\Assert\UnreachableException;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\NodeData\DataParsoid;
use Wikimedia\Parsoid\Tokens\CommentTk;
use Wikimedia\Parsoid\Tokens\EndTagTk;
use Wikimedia\Parsoid\Tokens\EOFTk;
use Wikimedia\Parsoid\Tokens\KV;
use Wikimedia\Parsoid\Tokens\KVSourceRange;
use Wikimedia\Parsoid\Tokens\NlTk;
use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Tokens\TagTk;
use Wikimedia\Parsoid\Tokens\Token;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
use Wikimedia\Parsoid\Wikitext\Consts;
}
{
/** @var Env Parsing environment; taken from $this->options['env'] in initialize(). */
private $env;
/** @var SiteConfig Site configuration obtained from the environment in initialize(). */
private $siteConfig;
/** @var array Options returned by the 'pegTokenizer' option's getOptions() call. */
private $pipelineOpts;
/** @var int Value of the 'pipelineOffset' option (0 when absent); used to shift token TSRs in emitChunk(). */
private $pipelineOffset;
/** Result of SiteConfig::getExtensionTagNameMap(), stored by initialize(). */
private $extTags;
/** NOTE(review): neither assigned nor read in the visible section; presumably a timing marker -- confirm. */
private $startTime;
/** @var string Anchored (PCRE 'A' modifier) lookahead pattern assembled in initialize(). */
private $reUrltextLookahead;
/** @var string NOTE(review): presumably scratch state for urltext matching; usage is outside the visible section. */
private $urltextPlainSegment = '';
/** @var bool NOTE(review): presumably scratch state for urltext matching; usage is outside the visible section. */
private $urltextFoundAutolink = false;
/**
 * Populate the tokenizer's state from the options passed to the parser.
 *
 * Reads the 'env', 'pegTokenizer' and (optional) 'pipelineOffset' entries
 * of $this->options, caches site-config derived data and assembles the
 * $reUrltextLookahead pattern.
 */
protected function initialize() {
	$opts = $this->options;

	$this->env = $opts['env'];
	$this->siteConfig = $this->env->getSiteConfig();

	// FIXME: inTemplate option may not always be set in
	// standalone tokenizers used by some pipeline handlers.
	$this->pipelineOpts = $opts['pegTokenizer']->getOptions();
	$this->pipelineOffset = $opts['pipelineOffset'] ?? 0;
	$this->extTags = $this->siteConfig->getExtensionTagNameMap();

	// The pattern lazily consumes a run of plain text characters and then
	// stops at an ampersand, a double underscore, a magic link prefix
	// (RFC / PMID / ISBN) or one of the site's protocols.
	$protocols = $this->siteConfig->getProtocolsRegex( true );
	$this->reUrltextLookahead = implode( '', [
		'!(?:',
		'([^-\'<[{\n\r:;\]}|\!=&]*?)',
		'(?:__|$|[-\'<[{\n\r:;\]}|\!=&]|(RFC|PMID|ISBN|',
		// Protocols are matched case-insensitively
		'(?i)',
		$protocols,
		')))!A',
	] );
}
// NOTE(review): presumably the offset reached by a previous match; not
// referenced in the visible section -- confirm against the grammar rules.
private $prevOffset = 0;
// NOTE(review): presumably a running counter for headings; not referenced
// in the visible section -- confirm against the grammar rules.
private $headingIndex = 0;
/**
 * Throw if a grammar invariant does not hold.
 *
 * @param mixed $condition Value tested for truthiness
 * @param string $text Description appended to the exception message
 * @throws \Exception when $condition is falsy
 */
private function assert( $condition, $text ) {
	if ( $condition ) {
		return;
	}
	throw new \Exception( "Grammar.pegphp assertion failure: $text" );
}
/**
 * Mark a code path that must never execute.
 *
 * @throws UnreachableException always
 */
private function unreachable() {
	$message = "Grammar.pegphp: this should be unreachable";
	throw new UnreachableException( $message );
}
// Some shorthands for legibility
/**
 * Shorthand for the parser's saved position ($this->savedPos).
 */
private function startOffset() {
	$start = $this->savedPos;
	return $start;
}
/**
 * Shorthand for the parser's current position ($this->currPos).
 */
private function endOffset() {
	$end = $this->currPos;
	return $end;
}
/**
 * Build a SourceRange from the parser's saved and current positions.
 *
 * @param string $flag 'start' yields a zero-width range at the saved
 *   position, 'end' a zero-width range at the current position; any other
 *   value yields the range from the saved to the current position.
 * @return SourceRange
 */
private function tsrOffsets( $flag = 'default' ): SourceRange {
	$from = $this->savedPos;
	$to = $this->currPos;

	// Loose comparison on purpose: it mirrors how a switch statement
	// compares its subject against each case label.
	if ( $flag == 'start' ) {
		$to = $from;
	} elseif ( $flag == 'end' ) {
		$from = $to;
	}

	return new SourceRange( $from, $to );
}
/**
 * Emit a chunk of tokens to our consumers. Once this has been done, the
 * current expression can return an empty list (true).
 *
 * The tokens have their tsr shifted by the pipeline offset, are logged on
 * the 'trace/peg' channel and are each checked against the parser
 * resource limits before being handed back.
 *
 * @param array|null $tokens Tokens matched for the current chunk
 * @return array The processed tokens, or an empty array when $tokens is falsy
 */
private function emitChunk( $tokens ) {
	// FIXME: We don't expect nulls here, but looks like
	// hack from I1c695ab6cdd3655e98877c175ddbabdee9dc44b7
	// introduces them. Work around it for now!
	if ( !$tokens ) {
		return [];
	}

	// Shift tsr of all tokens by the pipeline offset
	TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );
	$this->env->log( 'trace/peg', $this->options['pipelineId'] ?? '0', '----> ', $tokens );

	// Enforce parsing resource limits. Iterating with foreach visits exactly
	// the entries that exist, so no index bookkeeping is needed and gaps in
	// the keys cannot cause undefined-offset accesses.
	foreach ( $tokens as $token ) {
		TokenizerUtils::enforceParserResourceLimits( $this->env, $token );
	}

	return $tokens;
}
/* ------------------------------------------------------------------------
* Extension tags should be parsed with higher priority than anything else.
*
* The trick we use is to strip out the content inside a matching tag-pair
* and not tokenize it. The content, if it needs to be parsed (for example,
* for <ref>, <*include*> tags), is parsed in a fresh tokenizer context
* which means any error correction that needs to happen is restricted to
* the scope of the extension content and doesn't spill over to the higher
* level. Ex: ]