/**
 * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
 * chunks of tokens (one chunk per top-level block matched) and eventually an
 * end event. Tokens map to HTML tags as far as possible, with custom tokens
 * used where further processing on the token stream is needed.
 */
{

use Wikimedia\Assert\UnreachableException;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\NodeData\DataParsoid;
use Wikimedia\Parsoid\Tokens\CommentTk;
use Wikimedia\Parsoid\Tokens\EndTagTk;
use Wikimedia\Parsoid\Tokens\EOFTk;
use Wikimedia\Parsoid\Tokens\KV;
use Wikimedia\Parsoid\Tokens\KVSourceRange;
use Wikimedia\Parsoid\Tokens\NlTk;
use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Tokens\TagTk;
use Wikimedia\Parsoid\Tokens\Token;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
use Wikimedia\Parsoid\Wikitext\Consts;

}
{

	/** @var Env */
	private $env;

	/** @var SiteConfig */
	private $siteConfig;

	/** @var array */
	private $pipelineOpts;

	/** @var int */
	private $pipelineOffset;

	private $extTags;

	private $startTime;

	/** @var string */
	private $reUrltextLookahead;

	/** @var string */
	private $urltextPlainSegment = '';

	/** @var bool */
	private $urltextFoundAutolink = false;

	protected function initialize() {
		$this->env = $this->options['env'];
		$this->siteConfig = $this->env->getSiteConfig();

		$tokenizer = $this->options['pegTokenizer'];
		$this->pipelineOpts = $tokenizer->getOptions();
		// FIXME: the inTemplate option may not always be set in
		// standalone tokenizers used by some pipeline handlers.
		$this->pipelineOffset = $this->options['pipelineOffset'] ?? 0;
		$this->extTags = $this->siteConfig->getExtensionTagNameMap();

		// Non-greedy text_char sequence: stop at ampersand, double-underscore,
		// magic link prefix or protocol
		$this->reUrltextLookahead = '!(?:' .
			'([^-\'<[{\n\r:;\]}|\!=&]*?)' .
			'(?:__|$|[-\'<[{\n\r:;\]}|\!=&]|(RFC|PMID|ISBN|' .
			'(?i)' . $this->siteConfig->getProtocolsRegex( true ) . ')))!A';
	}

	private $prevOffset = 0;
	private $headingIndex = 0;

	private function assert( $condition, $text ) {
		if ( !$condition ) {
			throw new \Exception( "Grammar.pegphp assertion failure: $text" );
		}
	}

	private function unreachable() {
		throw new UnreachableException( "Grammar.pegphp: this should be unreachable" );
	}

	// Some shorthands for legibility
	private function startOffset() {
		return $this->savedPos;
	}

	private function endOffset() {
		return $this->currPos;
	}

	private function tsrOffsets( $flag = 'default' ): SourceRange {
		switch ( $flag ) {
			case 'start':
				return new SourceRange( $this->savedPos, $this->savedPos );
			case 'end':
				return new SourceRange( $this->currPos, $this->currPos );
			default:
				return new SourceRange( $this->savedPos, $this->currPos );
		}
	}

	/*
	 * Emit a chunk of tokens to our consumers. Once this has been done, the
	 * current expression can return an empty list (true).
	 */
	private function emitChunk( $tokens ) {
		// FIXME: We don't expect nulls here, but looks like
		// hack from I1c695ab6cdd3655e98877c175ddbabdee9dc44b7
		// introduces them. Work around it for now!
		if ( !$tokens ) {
			return [];
		}

		// Shift tsr of all tokens by the pipeline offset
		TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );

		$this->env->log( 'trace/peg', $this->options['pipelineId'] ??
'0', '----> ', $tokens ); $i = null; $n = count( $tokens ); // Enforce parsing resource limits for ( $i = 0; $i < $n; $i++ ) { TokenizerUtils::enforceParserResourceLimits( $this->env, $tokens[ $i ] ); } return $tokens; } /* ------------------------------------------------------------------------ * Extension tags should be parsed with higher priority than anything else. * * The trick we use is to strip out the content inside a matching tag-pair * and not tokenize it. The content, if it needs to parsed (for example, * for , <*include*> tags), is parsed in a fresh tokenizer context * which means any error correction that needs to happen is restricted to * the scope of the extension content and doesn't spill over to the higher * level. Ex: FAILS looking for }} // backtracks, popping "bracket_bracket" and "brace_brace" off preproc stack // wikilink->text,broken_template,text --> FAILS looking for ]] // backtracks, popping "bracket_bracket" and false off preproc stack // broken_wikilink,text,broken_template,text --> OK // with [false, false] left on the preproc stack broken_template = preproc:<&preproc> t:"{{" { $preproc = null; return $t; } template_preproc = "{{" leadWS:$( nl_comment_space* ) target:template_param_value params:( nl_comment_space* "|" r:( p0:("" { return $this->endOffset(); }) v:nl_comment_space* p:("" { return $this->endOffset(); }) &("|" / "}}") { // empty argument $tsr0 = new SourceRange( $p0, $p ); return new KV( '', TokenizerUtils::flattenIfArray( $v ), $tsr0->expandTsrV() ); } / template_param ) { return $r; } )* trailWS:$( nl_comment_space* ) inline_breaks "}}" { // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. array_unshift( $params, new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ) ); $dp = new DataParsoid; $dp->tsr = $this->tsrOffsets(); $dp->src = $this->text(); $tmp = $dp->getTemp(); $tmp->leadWS = $leadWS; $tmp->trailWS = $trailWS; $obj = new SelfclosingTagTk( 'template', $params, $dp ); return $obj; } / $('{{' space_or_newline* '}}') tplarg = tplarg_preproc<&preproc="}}"> tplarg_preproc = "{{{" p:("" { return $this->endOffset(); }) target:template_param_value? params:( nl_comment_space* "|" r:( p0:("" { return $this->endOffset(); }) v:nl_comment_space* p1:("" { return $this->endOffset(); }) &("|" / "}}}") { // empty argument return [ 'tokens' => $v, 'srcOffsets' => new SourceRange( $p0, $p1 ) ]; } / template_param_value ) { return $r; } )* nl_comment_space* inline_breaks "}}}" { $kvs = []; if ( $target === null ) { $target = [ 'tokens' => '', 'srcOffsets' => new SourceRange( $p, $p ) ]; } // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. $kvs[] = new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ); foreach ( $params as $o ) { $s = $o['srcOffsets']; $kvs[] = new KV( '', TokenizerUtils::flattenIfArray( $o['tokens'] ), $s->expandTsrV() ); } $dp = new DataParsoid; $dp->tsr = $this->tsrOffsets(); $dp->src = $this->text(); $obj = new SelfclosingTagTk( 'templatearg', $kvs, $dp ); return $obj; } template_param = name:template_param_name val:( kEndPos:("" { return $this->endOffset(); }) // no optionalSpaceToken here, it's eaten by template_param_name "=" vStartPos:("" { return $this->endOffset(); }) optSp:optionalSpaceToken tpv:template_param_value? 
{ return [ 'kEndPos' => $kEndPos, 'vStartPos' => $vStartPos, 'value' => TokenizerUtils::flattenString( [ $optSp, $tpv['tokens'] ?? [] ] ), ]; } )? { if ( $val !== null ) { if ( $val['value'] !== null ) { $so = new KVSourceRange( $this->startOffset(), $val['kEndPos'], $val['vStartPos'], $this->endOffset() ); return new KV( $name, TokenizerUtils::flattenIfArray( $val['value'] ), $so ); } else { return new KV( TokenizerUtils::flattenIfArray( $name ), '', $so ); } } else { $so = new SourceRange( $this->startOffset(), $this->endOffset() ); return new KV( '', TokenizerUtils::flattenIfArray( $name ), $so->expandTsrV() ); } } // empty parameter / & [|}] { $so = new SourceRange( $this->startOffset(), $this->endOffset() ); return new KV( '', '', $so->expandTsrV() ); } template_param_name = template_param_text / (&'=' { return ''; }) template_param_value = tpt:template_param_text { return [ 'tokens' => $tpt, 'srcOffsets' => $this->tsrOffsets() ]; } template_param_text = il:(nested_block / newlineToken)+ { // il is guaranteed to be an array -- so, tu.flattenIfArray will // always return an array $r = TokenizerUtils::flattenIfArray( $il ); if ( count( $r ) === 1 && is_string( $r[0] ) ) { $r = $r[0]; } return $r; } //// Language converter block markup of language variants: -{ ... }- // Note that "rightmost opening" precedence rule (see // https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means // that neither -{{ nor -{{{ are parsed as a -{ token, although // -{{{{ is (since {{{ has precedence over {{). lang_variant_or_tpl = &('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return $a; } / a:$('-' &('{{{'+ !'{')) b:tplarg { return [ $a, $b ]; } / a:$('-' &('{{' '{{{'* !'{')) b:template { return [ $a, $b ]; } / &'-{' a:lang_variant { return $a; } broken_lang_variant = r:"-{" preproc:<&preproc> { $preproc = null; return $r; } lang_variant = // FIXME: Maybe this should suppress "table" and "tableCellArg" like 'template_param_text' d too lang_variant_preproc<&preproc="}-", extlink=false> / broken_lang_variant lang_variant_preproc = lv0:("-{" { return $this->startOffset(); }) f:( &{ return $this->env->langConverterEnabled(); } ff:opt_lang_variant_flags { // if flags contains 'R', then don't treat ; or : specially inside. if ( isset( $ff['flags'] ) ) { $ff['raw'] = isset( $ff['flags']['R'] ) || isset( $ff['flags']['N'] ); } elseif ( isset( $ff['variants'] ) ) { $ff['raw'] = true; } return $ff; } / &{ return !$this->env->langConverterEnabled(); } "" { // if language converter not enabled, don't try to parse inside. return [ 'raw' => true ]; } ) ts:( &{ return $f['raw']; } lv:lang_variant_text { return [ [ 'text' => $lv ] ]; } / &{ return !$f['raw']; } lv:lang_variant_option_list { return $lv; } ) inline_breaks lv1:("}-" { return $this->endOffset(); }) { if ( !$this->env->langConverterEnabled() ) { return [ '-{', $ts[0]['text']['tokens'], '}-' ]; } $lvsrc = substr( $this->input, $lv0, $lv1 - $lv0 ); $attribs = []; foreach ( $ts as &$t ) { // move token strings into KV attributes so that they are // properly expanded by early stages of the token pipeline foreach ( [ 'text', 'from', 'to' ] as $fld ) { if ( !isset( $t[$fld] ) ) { continue; } $name = 'mw:lv' . count( $attribs ); // Note that AttributeExpander will expect the tokens array to be // flattened. We do that in lang_variant_text / lang_variant_nowiki $attribs[] = new KV( $name, $t[$fld]['tokens'], $t[$fld]['srcOffsets']->expandTsrV() ); $t[$fld] = $name; } } unset( $t ); $flags = isset( $f['flags'] ) ? 
array_keys( $f['flags'] ) : []; sort( $flags ); $variants = isset( $f['variants'] ) ? array_keys( $f['variants'] ) : []; sort( $variants ); $dp = new DataParsoid; $dp->tsr = new SourceRange( $lv0, $lv1 ); $dp->src = $lvsrc; $dp->flags = $flags; $dp->variants = $variants; $dp->original = $f['original']; $dp->flagSp = $f['sp']; $dp->texts = $ts; return [ new SelfclosingTagTk( 'language-variant', $attribs, $dp ) ]; } opt_lang_variant_flags = f:( ff:lang_variant_flags "|" { return $ff; } )? { // Collect & separate flags and variants into a hashtable (by key) and ordered list $flags = []; $variants = []; $flagList = []; $flagSpace = []; $variantList = []; $variantSpace = []; $useVariants = false; if ( $f !== null ) { // lang_variant_flags returns arrays in reverse order. $spPtr = count( $f['sp'] ) - 1; for ( $i = count( $f['flags'] ) - 1; $i >= 0; $i--) { $item = $f['flags'][$i]; if ( isset( $item['flag'] ) ) { $flagSpace[] = $f['sp'][$spPtr--]; $flags[$item['flag']] = true; $flagList[] = $item['flag']; $flagSpace[] = $f['sp'][$spPtr--]; } if ( isset( $item['variant'] ) ) { $variantSpace[] = $f['sp'][$spPtr--]; $variants[$item['variant']] = true; $variantList[] = $item['variant']; $variantSpace[] = $f['sp'][$spPtr--]; } } if ( $spPtr >= 0 ) { // handle space after a trailing semicolon $flagSpace[] = $f['sp'][$spPtr]; $variantSpace[] = $f['sp'][$spPtr]; } } // Parse flags (this logic is from core/languages/ConverterRule.php // in the parseFlags() function) if ( count( $flags ) === 0 && count( $variants ) === 0 ) { $flags['$S'] = true; } elseif ( isset( $flags['R'] ) ) { $flags = [ 'R' => true ]; // remove other flags } elseif ( isset( $flags['N'] ) ) { $flags = [ 'N' => true ]; // remove other flags } elseif ( isset( $flags['-'] ) ) { $flags = [ '-' => true ]; // remove other flags } elseif ( isset( $flags['T'] ) && count( $flags ) === 1 ) { $flags['H'] = true; } elseif ( isset( $flags['H'] ) ) { // Replace A flag, and remove other flags except T and D $nf = [ '$+' => true, 'H' => true ]; if ( isset( $flags['T'] ) ) { $nf['T'] = true; } if ( isset( $flags['D'] ) ) { $nf['D'] = true; } $flags = $nf; } elseif ( count( $variants ) > 0 ) { $useVariants = true; } else { if ( isset( $flags['A'] ) ) { $flags['$+'] = true; $flags['$S'] = true; } if ( isset( $flags['D'] ) ) { unset( $flags['$S'] ); } } if ( $useVariants ) { return [ 'variants' => $variants, 'original' => $variantList, 'sp' => $variantSpace ]; } else { return [ 'flags' => $flags, 'original' => $flagList, 'sp' => $flagSpace ]; } } lang_variant_flags = sp1:$(space_or_newline*) f:lang_variant_flag sp2:$(space_or_newline*) more:( ";" lang_variant_flags? )? { $r = ( $more && $more[1] ) ? $more[1] : [ 'sp' => [], 'flags' => [] ]; // Note that sp and flags are in reverse order, since we're using // right recursion and want to push instead of unshift. $r['sp'][] = $sp2; $r['sp'][] = $sp1; $r['flags'][] = $f; return $r; } / sp:$(space_or_newline*) { return [ 'sp' => [ $sp ], 'flags' => [] ]; } lang_variant_flag = f:[-+A-Z] { return [ 'flag' => $f ]; } / v:lang_variant_name { return [ 'variant' => $v ]; } / b:$(!space_or_newline !nowiki [^{}|;])+ { return [ 'bogus' => $b ]; /* bad flag */} // language variant name, like zh, zh-cn, etc. lang_variant_name = $([a-z] [-a-zA-Z]+) // Escaped otherwise-unrepresentable language names // Primarily for supporting html2html round trips; PHP doesn't support // using nowikis here (yet!) 
/ nowiki_text lang_variant_option_list = o:lang_variant_option rest:( ";" oo:lang_variant_option { return $oo; })* tr:( ";" $bogus_lang_variant_option )* // optional trailing crap { array_unshift( $rest, $o ); // if the last bogus option is just spaces, keep them; otherwise // drop all this bogus stuff on the ground if ( count($tr) > 0 ) { $last = $tr[count($tr)-1]; if (preg_match('/^\s*$/Du', $last[1])) { $rest[] = [ 'semi' => true, 'sp' => $last[1] ]; } } return $rest; } / lvtext:lang_variant_text { return [ [ 'text' => $lvtext ] ]; } bogus_lang_variant_option = lang_variant_text? lang_variant_option = sp1:$(space_or_newline*) lang:lang_variant_name sp2:$(space_or_newline*) ":" sp3:$(space_or_newline*) lvtext:(lang_variant_nowiki / lang_variant_text_no_semi) { return [ 'twoway' => true, 'lang' => $lang, 'text' => $lvtext, 'sp' => [ $sp1, $sp2, $sp3 ] ]; } / sp1:$(space_or_newline*) from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow) "=>" sp2:$(space_or_newline*) lang:lang_variant_name sp3:$(space_or_newline*) ":" sp4:$(space_or_newline*) to:(lang_variant_nowiki / lang_variant_text_no_semi) { return [ 'oneway' => true, 'from' => $from, 'lang' => $lang, 'to' => $to, 'sp' => [ $sp1, $sp2, $sp3, $sp4 ] ]; } // html2wt support: If a language name or conversion string can't be // represented w/o breaking wikitext, just wrap it in a . // PHP doesn't support this (yet), but Parsoid does. lang_variant_nowiki = n:nowiki_text sp:$space_or_newline* { $tsr = $this->tsrOffsets(); $tsr->end -= strlen( $sp ); return [ 'tokens' => [ $n ], 'srcOffsets' => $tsr, ]; } lang_variant_text = tokens:(inlineline / "|" )* { return [ 'tokens' => TokenizerUtils::flattenStringlist( $tokens ), 'srcOffsets' => $this->tsrOffsets(), ]; } lang_variant_text_no_semi = lang_variant_text lang_variant_text_no_semi_or_arrow = lang_variant_text_no_semi wikilink_content = ( pipe startPos:("" { return $this->endOffset(); }) lt:link_text? { $tsr = new SourceRange( $startPos, $this->endOffset() ); $maybeContent = new KV( 'mw:maybeContent', $lt ?? [], $tsr->expandTsrV() ); $maybeContent->vsrc = substr( $this->input, $startPos, $this->endOffset() - $startPos ); return $maybeContent; } )* wikilink = wikilink_preproc<&preproc="]]"> / broken_wikilink // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the // second bracket could start an extlink. Set preproc to false as a reference // parameter in the parent since we haven't seen a double-close bracket. // (See full explanation above broken_template production.) broken_wikilink = &"[[" preproc:<&preproc> &{ $preproc = null; return true; } a:("[" (extlink / "[")) { return $a; } wikilink_preproc = "[[" spos:("" { return $this->endOffset(); }) target:wikilink_preprocessor_text? 
tpos:("" { return $this->endOffset(); }) lcs:wikilink_content inline_breaks "]]" { $pipeTrick = count( $lcs ) === 1 && count( $lcs[0]->v ) === 0; $textTokens = []; if ( $target === null || $pipeTrick ) { $textTokens[] = '[['; if ( $target ) { $textTokens[] = $target; } foreach ( $lcs as $a ) { // a is a mw:maybeContent attribute $textTokens[] = '|'; if ( count( $a->v ) > 0 ) { $textTokens[] = $a->v; } } $textTokens[] = ']]'; return $textTokens; } $obj = new SelfclosingTagTk( 'wikilink' ); $tsr = new SourceRange( $spos, $tpos ); $hrefKV = new KV( 'href', $target, $tsr->expandTsrV() ); $hrefKV->vsrc = $tsr->substr( $this->input ); // XXX: Point to object with path, revision and input information // obj.source = input; $obj->attribs[] = $hrefKV; $obj->attribs = array_merge( $obj->attribs, $lcs ); $dp = new DataParsoid; $dp->tsr = $this->tsrOffsets(); $dp->src = $this->text(); $obj->dataParsoid = $dp; return [ $obj ]; } // Tables are allowed inside image captions. // Suppress the equal flag temporarily in this rule to consume the '=' here. link_text = link_text_parameterized link_text_parameterized = c:( // This group is similar to "block_line" but "list_item" // is omitted since `doBlockLevels` happens after // `handleInternalLinks2`, where newlines are stripped. (sol (heading / hr / full_table_in_link_caption)) / urltext / ( !inline_breaks r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return $r; } ) )+ { return TokenizerUtils::flattenStringlist( $c ); } /* Generic quote rule for italic and bold, further processed in a token * stream transformation in doQuotes. Relies on NlTk tokens being emitted * for each line of text to balance quotes per line. * * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = quotes:$("''" "'"*) { // sequences of four or more than five quotes are assumed to start // with some number of plain-text apostrophes. $plainticks = 0; $result = []; if ( strlen( $quotes ) === 4 ) { $plainticks = 1; } elseif ( strlen( $quotes ) > 5 ) { $plainticks = strlen( $quotes ) - 5; } if ( $plainticks > 0 ) { $result[] = substr( $quotes, 0, $plainticks ); } // mw-quote token will be consumed in token transforms $tsr = $this->tsrOffsets(); $tsr->start += $plainticks; $dp = new DataParsoid; $dp->tsr = $tsr; $mwq = new SelfclosingTagTk( 'mw-quote', [ new KV( 'value', substr( $quotes, $plainticks ) ) ], $dp ); if ( strlen( $quotes ) > 2 ) { $mwq->addAttribute( 'isSpace_1', $tsr->start > 0 && substr( $this->input, $tsr->start - 1, 1 ) === ' '); $mwq->addAttribute( 'isSpace_2', $tsr->start > 1 && substr( $this->input, $tsr->start - 2, 1 ) === ' '); } $result[] = $mwq; return $result; } /*********************************************************** * Xmlish tags ***********************************************************/ // FIXME: Temporary (?) hack to let us not horribly break on old tvar syntax // In coordination with language team, get rid of this hack once all old uses // are migrated to new syntax (T274881). 
tvar_old_syntax_closing_HACK = "/>" & { return $this->env->hasAnnotations && $this->siteConfig->isAnnotationTag( 'tvar' ); } { $metaAttrs = [ new KV( 'typeof', 'mw:Annotation/tvar/End' ) ]; $dp = new DataParsoid(); $dp->tsr = $this->tsrOffsets(); $dp->tsr->start--; // For "<" matched at the start of xmlish_tag rule if ( empty( $this->pipelineOpts['inTemplate'] ) ) { return [ new SelfclosingTagTk ( 'meta', $metaAttrs, $dp ) ]; } else { // suppress meta tags from pipeline output return []; } } annotation_tag = annToken:extension_annotation_tag &{ return ( $annToken instanceof Token && $annToken->getName() !== 'extension' ); } { return $annToken; } extension_annotation_tag = ! "<" tag:( extToken:xmlish_tag // Account for `maybeAnnotationOrExtensionTag` returning unmatched start / end tags &{ return !$extToken || $extToken[0]->getName() === 'extension' || ($extToken[0]->getName() === 'meta' && preg_match( WTUtils::ANNOTATION_META_TYPE_REGEXP, $extToken[0]->getAttribute( 'typeof' ) ?? '' ) > 0); } { return !$extToken ? '' : $extToken[0]; } / tvar_old_syntax_closing_HACK ) { return $tag; } nowiki = & ("<" "/"? "nowiki"i ) extToken:extension_annotation_tag { return $extToken; } // Used by lang_variant productions to protect special language names or // conversion strings. nowiki_text = extToken:nowiki { $txt = Utils::extractExtBody( $extToken ); return Utils::decodeWtEntities( $txt ); } /* Generic XML-like tags * * These also cover extensions (including Cite), which will hook into the * token stream for further processing. The content of extension tags is * parsed as regular inline, but the source positions of the tag are added * to allow reconstructing the unparsed text from the input. */ // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and the following // paragraphs. Note that we don't enforce ascii alpha for the first character // here because we need to be more permissive for extension tag names. That // happens in xmlish_tag below. tag_name = $[^\t\n\v />\0]+ // This rule is used in carefully crafted places of xmlish tag tokenizing with // the inclusion of solidus to match where the spec would ignore those // characters. In particular, it does not belong in between attribute name // and value. space_or_newline_or_solidus = space_or_newline / (s:"/" !">" { return $s; }) xmlish_tag = & { $this->assert( $this->input[$this->currPos - 1] === '<', 'Failed to open xmlish_tag before entering.' ); return true; } end:"/"? name: tag_name annOrExtTag: & { if ( $annOrExtTag ) { return WTUtils::isAnnOrExtTag( $this->env, $name ); } else { // Only enforce ascii alpha first char for non-extension tags. // See tag_name above for the details. return preg_match( '/^[A-Za-z]/', $name ) && $this->isXMLTag( $name ); } } // By the time we get to `doTableStuff` in the old parser, we've already // safely encoded element attributes. See 55313f4e in core. attribs:generic_newline_attributes space_or_newline_or_solidus* // No need to preserve this -- canonicalize on RT via dirty diff selfclose:"/"? space* // not preserved - canonicalized on RT via dirty diff ">" { $lcName = mb_strtolower( $name ); // Extension tags don't necessarily have the same semantics as html tags, // so don't treat them as void elements. $isVoidElt = Utils::isVoidElement( $lcName ) && !$annOrExtTag; // Support
if ( $lcName === 'br' && $end ) { $end = null; } $tsr = $this->tsrOffsets(); $tsr->start--; // For "<" matched at the start of xmlish_tag rule $res = TokenizerUtils::buildXMLTag( $name, $lcName, $attribs, $end, !!$selfclose || $isVoidElt, $tsr ); // change up data-attribs in one scenario // void-elts that aren't self-closed ==> useful for accurate RT-ing if ( !$selfclose && $isVoidElt ) { unset( $res->dataParsoid->selfClose ); $res->dataParsoid->noClose = true; } $met = $this->maybeAnnotationOrExtensionTag( $res, $end, $attribs, $tsr ); return is_array( $met ) ? $met : [ $met ]; } // A generic attribute that can span multiple lines. generic_newline_attribute = space_or_newline_or_solidus* namePos0:("" { return $this->endOffset(); }) name:generic_attribute_name namePos:("" { return $this->endOffset(); }) vd:(space_or_newline* "=" v:generic_att_value? { return $v; })? { // NB: Keep in sync w/ table_attibute $res = null; // Encapsulate protected attributes. if ( is_string( $name ) ) { $name = TokenizerUtils::protectAttrs( $name ); } $nameSO = new SourceRange( $namePos0, $namePos ); if ( $vd !== null ) { $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) ); $res->vsrc = $vd['srcOffsets']->substr( $this->input ); } else { $res = new KV( $name, '', $nameSO->expandTsrK() ); } if ( is_array( $name ) ) { $res->ksrc = $nameSO->substr( $this->input ); } return $res; } // A single-line attribute. table_attribute = s:optionalSpaceToken namePos0:("" { return $this->endOffset(); }) name:table_attribute_name namePos:("" { return $this->endOffset(); }) vd:(optionalSpaceToken "=" v:table_att_value? { return $v; })? { // NB: Keep in sync w/ generic_newline_attribute $res = null; // Encapsulate protected attributes. if ( gettype( $name ) === 'string' ) { $name = TokenizerUtils::protectAttrs( $name ); } $nameSO = new SourceRange( $namePos0, $namePos ); if ( $vd !== null ) { $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) ); $res->vsrc = $vd['srcOffsets']->substr( $this->input ); } else { $res = new KV( $name, '', $nameSO->expandTsrK() ); } if ( is_array( $name ) ) { $res->ksrc = $nameSO->substr( $this->input ); } return $res; } // The old parser's Sanitizer::removeHTMLtags explodes on < so that it can't // be found anywhere in xmlish tags. This is a divergence from html5 tokenizing // which happily permits it in attribute positions. Extension tags being the // exception, since they're stripped beforehand. less_than = $( & "<" ) // The arrangement of chars is to emphasize the split between what's disallowed // by html5 and what's necessary to give directive a chance. // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 generic_attribute_name = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive r:( $[^ \t\r\n\0/=><&{}\-!|]+ / !inline_breaks // \0/=> is the html5 attribute name set we do not want. t:( directive / less_than / $( !( space_or_newline / [\0/=><] ) . ) ) { return $t; } )* & { return count( $r ) > 0 || $q !== ''; } { array_unshift( $r, $q ); return TokenizerUtils::flattenString( $r ); } // Also accept these chars in a wikitext table or tr attribute name position. // They are normally not matched by the table_attribute_name. broken_table_attribute_name_char = c:[\0/=>] { return new KV( $c, '' ); } // Same as generic_attribute_name, except for accepting tags and wikilinks. // (That doesn't make sense (ie. match the old parser) in the generic case.) // We also give a chance to break on \[ (see T2553). 
table_attribute_name = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive r:( $[^ \t\r\n\0/=><&{}\-!|\[]+ / !inline_breaks // \0/=> is the html5 attribute name set we do not want. t:( $wikilink / directive // Accept tags-inside-attributes as attribute names. // The sanitizer will strip and shadow them for roundtripping. // Example: generated with.. / x:inline_xmlish_tag ill:inlineline? { return array_merge( $x, $ill ?: [] ); } / $( !( space_or_newline / [\0/=>] ) . ) ) { return $t; } )* & { return count( $r ) > 0 || $q !== ''; } { array_unshift( $r, $q ); return TokenizerUtils::flattenString( $r ); } // Attribute value, quoted variants can span multiple lines. // Missing end quote: accept /> look-ahead as heuristic. // These need to be kept in sync with the attribute_preprocessor_text_* generic_att_value = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? q:$("'" / &('/'? '>')) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() ); } // Attribute value, restricted to a single line. // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. // These need to be kept in sync with the table_attribute_preprocessor_text_* table_att_value = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() ); } /********************************************************* * Lists *********************************************************/ list_item = dtdd / hacky_dl_uses / li li = bullets:list_char+ c:inlineline? // The inline_break is to check if we've hit a template end delimiter. &(eolf / inline_breaks) { // Leave bullets as an array -- list handler expects this $tsr = $this->tsrOffsets( 'start' ); $tsr->end += count( $bullets ); $dp = new DataParsoid; $dp->tsr = $tsr; $li = new TagTk( 'listItem', [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ], $dp ); return array_merge( [ $li ], $c ?: [] ); } /* * This rule is required to support wikitext of this form * ::{|border="1"|foo|bar|baz|} * where the leading colons are used to indent the entire table. * This hack was added back in 2006 in commit * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl * Fürstenberg. */ hacky_dl_uses = bullets:":"+ tbl:(table_line (sol+ table_line)*) line:inlineline? 
&((space / comment)* eolf) { // Leave bullets as an array -- list handler expects this $tsr = $this->tsrOffsets( 'start' ); $tsr->end += count( $bullets ); $dp = new DataParsoid; $dp->tsr = $tsr; $li = new TagTk( 'listItem', [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ], $dp ); return TokenizerUtils::flattenIfArray( [ $li, $tbl, $line ?: [] ] ); } dtdd = bullets:(!(";" !list_char) lc:list_char { return $lc; })* ";" c:inlineline_break_on_colon? cpos:(":" { return $this->endOffset(); }) d:inlineline? &eolf { // Leave bullets as an array -- list handler expects this // TSR: +1 for the leading ";" $numBullets = count( $bullets ) + 1; $tsr = $this->tsrOffsets( 'start' ); $tsr->end += $numBullets; $li1Bullets = $bullets; $li1Bullets[] = ';'; $dp = new DataParsoid; $dp->tsr = $tsr; $li1 = new TagTk( 'listItem', [ new KV( 'bullets', $li1Bullets, $tsr->expandTsrV() ) ], $dp ); // TSR: -1 for the intermediate ":" $li2Bullets = $bullets; $li2Bullets[] = ':'; $tsr2 = new SourceRange( $cpos - 1, $cpos ); $dp2 = new DataParsoid; $dp2->tsr = $tsr2; $dp2->stx = 'row'; $li2 = new TagTk( 'listItem', [ new KV( 'bullets', $li2Bullets, $tsr2->expandTsrV() ) ], $dp2 ); return array_merge( [ $li1 ], $c ?: [], [ $li2 ], $d ?: [] ); } list_char = [*#:;] inlineline_break_on_colon = inlineline /****************************************************************************** * Tables * ------ * Table rules are geared to support independent parsing of fragments in * templates (the common table start / row / table end use case). The tokens * produced by these fragments then match up to a table while building the * DOM tree. For similar reasons, table rows do not emit explicit end tag * tokens. * * The separate table_line rule is faster than moving those rules * directly to block_lines. * * Notes about the full_table_in_link_caption rule * ----------------------------------------------------- * However, for link-tables, we have introduced a stricter parse wherein * we require table-start and table-end tags to not come from a template. * In addition, this new rule doesn't accept fosterable-content in * the table unlike the more lax (sol table_line)+ rule. * * This is the best we can do at this time since we cannot distinguish * between table rows and image options entirely in the tokenizer. * * Consider the following examples: * * Example 1: * * [[Image:Foo.jpg|left|30px|Example 1 * {{This-template-returns-a-table-start-tag}} * |foo * {{This-template-returns-a-table-end-tag}} * ]] * * Example 2: * * [[Image:Foo.jpg|left|30px|Example 1 * {{1x|a}} * |foo * {{1x|b}} * ]] * * So, we cannot know a priori (without preprocessing or fully expanding * all templates) if "|foo" in the two examples is a table cell or an image * option. This is a limitation of our tokenizer-based approach compared to * the preprocessing-based approach of the old parser. * * Given this limitation, we are okay forcing a full-table context in * link captions (if necessary, we can relax the fosterable-content requirement * but that is broken wikitext anyway, so we can force that edge-case wikitext * to get fixed by rejecting it). ******************************************************************************/ full_table_in_link_caption = !inline_breaks // Note that "linkdesc" is suppressed here to provide a nested parsing // context in which to parse the table. Otherwise, we may break on // on pipes in the `table_start_tag` and `table_row_tag` attributes. 
// However, as a result, this can be more permissive than the old // implementation (legacy parser?), but likelier to match the users intent. // Suppress the recursion protection from tableDataBlock since we're trying // to parse a full table and if the link is itself nested in a table this // will always stop. Hopefully, this won't result in any overflows. r: full_table_in_link_caption_parameterized { return $r; } full_table_in_link_caption_parameterized = table_start_tag // Accept multiple end tags since a nested table may have been // opened in the table content line. ( (sol+ (table_content_line / tplarg_or_template))* sol+ table_end_tag )+ // This rule assumes start-of-line position! table_line = (! inline_breaks / & '{{!}}' ) tl:( table_start_tag / table_content_line
/ table_end_tag ) { return $tl; } table_content_line = (space / comment)* ( table_heading_tags / table_row_tag / table_data_tags / table_caption_tag ) table_start_tag "table_start_tag" = sc:(space / comment)* startPos:("" { return $this->endOffset(); }) b:"{" p:pipe // ok to normalize away stray |} on rt (see T59360) ta:(table_attributes / &{ $this->unreachable(); }) tsEndPos:("" { return $this->endOffset(); }) s2:space* { $coms = TokenizerUtils::popComments( $ta ); if ( $coms ) { $tsEndPos = $coms['commentStartPos']; } $dp = new DataParsoid; $dp->tsr = new SourceRange( $startPos, $tsEndPos ); if ( $p !== '|' ) { // Variation from default $dp->startTagSrc = $b . $p; } return array_merge( $sc, [ new TagTk( 'table', $ta, $dp ) ], $coms ? $coms['buf'] : [], $s2 ); } // FIXME: Not sure if we want to support it, but this should allow columns. table_caption_tag = // avoid recursion via nested_block_in_table ! p:pipe "+" args:row_syntax_table_args? tagEndPos:("" { return $this->endOffset(); }) c:nested_block_in_table* { $tsr = new SourceRange( $this->startOffset(), $tagEndPos ); return TokenizerUtils::buildTableTokens( 'caption', '|+', $args, $tsr, $this->endOffset(), $c, true ); } table_row_tag = // avoid recursion via nested_block_in_table ! p:pipe dashes:$"-"+ a:(table_attributes / &{ $this->unreachable(); }) tagEndPos:("" { return $this->endOffset(); }) s2:space* { $coms = TokenizerUtils::popComments( $a ); if ( $coms ) { $tagEndPos = $coms['commentStartPos']; } $da = new DataParsoid; $da->tsr = new SourceRange( $this->startOffset(), $tagEndPos ); $da->startTagSrc = $p . $dashes; // We rely on our tree builder to close the row as needed. This is // needed to support building tables from fragment templates with // individual cells or rows. $trToken = new TagTk( 'tr', $a, $da ); return array_merge( [ $trToken ], $coms ? $coms['buf'] : [], $s2 ); } tds = ( pp:( pipe_pipe / p:pipe & row_syntax_table_args { return $p; } ) tdt:table_data_tag { // Avoid modifying cached dataParsoid object $tdt[0] = clone $tdt[0]; $da = $tdt[0]->dataParsoid = clone $tdt[0]->dataParsoid; $da->tsr = clone $da->tsr; $da->stx = 'row'; $da->tsr->start -= strlen( $pp ); // include "||" if ( $pp !== '||' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) { // Variation from default $da->startTagSrc = $pp . ( isset( $da->startTagSrc ) ? substr( $da->startTagSrc, 1 ) : '' ); } return $tdt; } )* table_data_tags = // avoid recursion via nested_block_in_table ! p:pipe ![+-] td:table_data_tag tagEndPos:("" { return $this->endOffset(); }) tds:tds { // Avoid modifying a cached result $td[0] = clone $td[0]; $da = $td[0]->dataParsoid = clone $td[0]->dataParsoid; $da->tsr = clone $da->tsr; $da->tsr->start -= strlen( $p ); // include "|" if ( $p !== '|' ) { // Variation from default $da->startTagSrc = $p; } return array_merge( $td, $tds ); } table_data_tag = ! "}" arg:row_syntax_table_args? // use inline_breaks to break on tr etc tagEndPos:("" { return $this->endOffset(); }) td:nested_block_in_table* { $tsr = new SourceRange( $this->startOffset(), $tagEndPos ); return TokenizerUtils::buildTableTokens( 'td', '|', $arg, $tsr, $this->endOffset(), $td ); } table_heading_tags = table_heading_tags_parameterized<&th> table_heading_tags_parameterized = "!" thTag:table_heading_tag thTags:( pp:("!!" 
/ pipe_pipe) tht:table_heading_tag { // Avoid modifying a cached result $tht[0] = clone $tht[0]; $da = $tht[0]->dataParsoid = clone $tht[0]->dataParsoid; $da->tsr = clone $da->tsr; $da->stx = 'row'; $da->tsr->start -= strlen( $pp ); // include "!!" or "||" if ( $pp !== '!!' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) { // Variation from default $da->startTagSrc = $pp . ( isset( $da->startTagSrc ) ? substr( $da->startTagSrc, 1 ) : '' ); } return $tht; } )* { $thTag[0] = clone $thTag[0]; $da = $thTag[0]->dataParsoid = clone $thTag[0]->dataParsoid; $da->tsr = clone $da->tsr; $da->tsr->start--; // include "!" array_unshift( $thTags, $thTag ); return $thTags; } table_heading_tag = arg:row_syntax_table_args? tagEndPos:("" { return $this->endOffset(); }) c:( th:<&th> d:nested_block_in_table { if ( $th !== false && strpos( $this->text(), "\n" ) !== false ) { // There's been a newline. Remove the break and continue // tokenizing nested_block_in_tables. $th = false; } return $d; } )* { $tsr = new SourceRange( $this->startOffset(), $tagEndPos ); return TokenizerUtils::buildTableTokens( 'th', '!', $arg, $tsr, $this->endOffset(), $c ); } table_end_tag = sc:(space / comment)* startPos:("" { return $this->endOffset(); }) p:pipe b:"}" { $dp = new DataParsoid; $dp->tsr = new SourceRange( $startPos, $this->endOffset() ); $tblEnd = new EndTagTk( 'table', [], $dp ); if ( $p !== '|' ) { // p+"" is triggering some bug in pegJS // I cannot even use that expression in the comment! $tblEnd->dataParsoid->endTagSrc = $p . $b; } array_push( $sc, $tblEnd ); return $sc; } /** * Table parameters separated from the content by a single pipe. Does *not* * match if followed by double pipe (row-based syntax). */ row_syntax_table_args = as:table_attributes s:optional_spaces p:pipe !pipe { return [ $as, $s, $p ]; } /******************************************************************* * Text variants and other general rules *******************************************************************/ /* All chars that cannot start syntactic structures in the middle of a line * XXX: ] and other end delimiters should probably only be activated inside * structures to avoid unnecessarily leaving the text rule on plain * content. * * TODO: Much of this is should really be context-dependent (syntactic * flags). The wikilink_preprocessor_text rule is an example where * text_char is not quite right and had to be augmented. Try to minimize / * clarify this carefully! * * This character class is inlined into $this->reUrltextLookahead. Changes * here may also need to be reflected there. */ text_char = [^-'<[{\n\r:;\]}|!=] /* Legend * ' quotes (italic/bold) * < start of xmlish_tag * [ start of links * { start of parser functions, transclusion and template args * \n all sort of block-level markup at start of line * \r ditto * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC) * * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related) * ! and | table cell delimiters, might be better to specialize those * = headings - also specialize those! * * The following chars are also included for now, but only apply in some * contexts and should probably be enabled only in those: * : separate definition in ; term : definition * ] end of link * } end of parser func/transclusion/template arg * - start of lang_variant -{ ... 
}-
 * ; separator in lang_variant
 */

urltext = (
  /* Very special performance hack:
   * Look for a plain text sequence, and if found, pretend to match the
   * empty string, but then advance currPos in the action and return the
   * whole plain text segment as a single result.
   */
  & {
    if ( preg_match( $this->reUrltextLookahead, $this->input, $m, 0, $this->currPos ) ) {
      $plain = $m[1];
      $this->urltextPlainSegment = $plain;
      $this->urltextFoundAutolink = ( $m[2] ?? '' ) !== '';
      return (bool)strlen( $plain );
    } else {
      $this->urltextFoundAutolink = false;
      return false;
    }
  } '' {
    $this->currPos += strlen( $this->urltextPlainSegment );
    return $this->urltextPlainSegment;
  }
  / & { return $this->urltextFoundAutolink; } al:autolink { return $al; }
  / & "&" he:htmlentity { return $he; }
  / & ('__') bs:behavior_switch { return $bs; }
  / text_char )+

raw_htmlentity = m:$("&" [#0-9a-zA-Zרלמرلم]+ ";") {
  return Utils::decodeWtEntities( $m );
}

htmlentity = cc:raw_htmlentity {
  // if this is an invalid entity, don't tag it with 'mw:Entity'
  // note that some entities (like ∾̳) decode to 2 codepoints!
  if ( mb_strlen( $cc ) > 2 /* decoded entity would be 1-2 codepoints */ ) {
    return $cc;
  }
  $dpStart = new DataParsoid;
  $dpStart->src = $this->text();
  $dpStart->srcContent = $cc;
  $dpStart->tsr = $this->tsrOffsets( 'start' );
  $dpEnd = new DataParsoid;
  $dpEnd->tsr = $this->tsrOffsets( 'end' );
  return [
    // If this changes, the nowiki extension's toDOM will need to follow suit
    new TagTk( 'span', [ new KV( 'typeof', 'mw:Entity' ) ], $dpStart ),
    $cc,
    new EndTagTk( 'span', [], $dpEnd )
  ];
}

/**
 * noinclude / includeonly / onlyinclude rules. These are normally
 * handled by the xmlish_tag rule, except where generic tags are not
 * allowed - for example in directives, which are allowed in various
 * attribute names and values.
 *
 * Example test case:
 * {|
 * |-
 * foo
 *
 * |Hello
 * |}
 */
include_limits =
  & ("<" "/"? ( "includeonly"i / "noinclude"i / "onlyinclude"i ) )
  "<" il:xmlish_tag
  sol_il:<sol_il>
  & {
    $il = $il[0];
    $lname = mb_strtolower( $il->getName() );
    if ( !WTUtils::isIncludeTag( $lname ) ) {
      return false;
    }
    // Preserve SOL where necessary (for onlyinclude and noinclude)
    // Note that this only works because we encounter <*include*> tags in
    // the toplevel content and we rely on the php preprocessor to expand
    // templates, so we shouldn't ever be tokenizing inInclude.
    // Last line should be empty (except for comments)
    if ( $lname !== 'includeonly' && $sol_il && $il instanceof TagTk ) {
      $dp = $il->dataParsoid;
      $inclContent = $dp->extTagOffsets->stripTags( $dp->src );
      $nlpos = strrpos( $inclContent, "\n" );
      $last = $nlpos === false ? $inclContent : substr( $inclContent, $nlpos + 1 );
      if ( !preg_match( '/^(<!--([^-]|-(?!->))*-->)*$/D', $last ) ) {
        return false;
      }
    }
    return true;
  } {
    return $il;
  }

// 'Preprocessor' directive - higher-level things that can occur in otherwise
// plain-text content.
directive
  = comment
  / extension_annotation_tag
  / tplarg_or_template
  / & "-{" v:lang_variant_or_tpl { return $v; }
  / & "&" e:htmlentity { return $e; }
  / include_limits

wikilink_preprocessor_text
  = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+
        // XXX gwicke: any more chars we need to allow here?
      / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) {
          return $wr;
      }
    )+ {
      return TokenizerUtils::flattenStringlist( $r );
    }

// added special separator character class inline: separates url from
// description / text
extlink_nonipv6url =
  // Prevent breaking on pipes when we're in a link description.
  // See the test, 'Images with the "|" character in the comment'.
extlink_nonipv6url_parameterized extlink_nonipv6url_parameterized = r:( $[^<[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ / !inline_breaks s:( directive / [&|{\-!}=] ) { return $s; } / $(['] ![']) // single quotes are ok, double quotes are bad )+ { return TokenizerUtils::flattenString( $r ); } // Attribute values with preprocessor support // n.b. / is a permissible char in the three rules below. // We only break on />, enforced by the negated expression. // Hence, it isn't included in the stop set. // The stop set is space_or_newline and > which matches generic_att_value. attribute_preprocessor_text = r:( $[^{}&<\-|/ \t\n\r\x0c>]+ / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; } )+ { return TokenizerUtils::flattenString( $r ); } // The stop set is '> which matches generic_att_value. attribute_preprocessor_text_single = r:( $[^{}&<\-|/'>]+ / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; } )* { return TokenizerUtils::flattenString( $r ); } // The stop set is "> which matches generic_att_value. attribute_preprocessor_text_double = r:( $[^{}&<\-|/">]+ / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; } )* { return TokenizerUtils::flattenString( $r ); } // Variants with the entire attribute on a single line // n.b. ! is a permissible char in the three rules below. // We only break on !! in th, enforced by the inline break. // Hence, it isn't included in the stop set. // [ is also permissible but we give a chance to break // for the [[ special case in the old parser's doTableStuff (See T2553). // The stop set is space_or_newline and | which matches table_att_value. table_attribute_preprocessor_text = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+ / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; } )+ { return TokenizerUtils::flattenString( $r ); } // The stop set is '\r\n| which matches table_att_value. table_attribute_preprocessor_text_single = r:( $[^{}&<\-!\['\r\n|]+ / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; } )* { return TokenizerUtils::flattenString( $r ); } // The stop set is "\r\n| which matches table_att_value. table_attribute_preprocessor_text_double = r:( $[^{}&<\-!\["\r\n|]+ / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; } )* { return TokenizerUtils::flattenString( $r ); } // Special-case support for those pipe templates pipe = "|" / "{{!}}" // SSS FIXME: what about |{{!}} and {{!}}| pipe_pipe = "||" / "{{!}}{{!}}" space = [ \t] optional_spaces = $[ \t]* // Start of file sof = & { return $this->endOffset() === 0 && !$this->pipelineOffset; } // End of file eof = & { return $this->endOffset() === $this->inputLength; } newline = '\n' / '\r\n' newlineToken = newline { return [ new NlTk( $this->tsrOffsets() ) ]; } eolf = newline / eof // The old parser does a straight str.replace(/).)*-->/g, "") // but, as always, things around here are a little more complicated. // // We accept the same comments, but because we emit them as HTML comments // instead of deleting them, we have to encode the data to ensure that // we always emit a valid HTML5 comment. See the encodeComment helper // for further details. 
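// Note for the rule below: a comment that is still open when the input ends
// (for example a trailing "<!-- todo") is still emitted as a CommentTk, with
// $dp->unclosedComment set to record that the closing "-->" was missing.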
comment
  = '<!--' c:$(!"-->" .)* cEnd:$('-->' / eof) {
    $data = WTUtils::encodeComment( $c );
    $dp = new DataParsoid;
    $dp->tsr = $this->tsrOffsets();
    if ( $cEnd !== '-->' ) {
      $dp->unclosedComment = true;
    }
    return [ new CommentTk( $data, $dp ) ];
  }

nl_comment_space = newlineToken / space / comment

optionalSpaceToken
  = s:optional_spaces {
    if ( $s !== '' ) {
      return [ $s ];
    } else {
      return [];
    }
  }

/* This rule corresponds to \s in the PHP preg_* functions,
 * which is used frequently in the old parser. The inclusion of
 * form feed (but not other whitespace, like vertical tab) is a quirk
 * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular
 * Expressions) library.
 */
space_or_newline = [ \t\n\r\x0c]

/* This rule corresponds to \b in the PHP preg_* functions,
 * after a word character. That is, it's a zero-width lookahead that
 * the next character is not a word character.
 */
end_of_word = eof / ![A-Za-z0-9_]

// Unicode "separator, space" category. It covers the \u0020 space as well
// as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}.
// Keep this up-to-date with the characters tagged ;Zs; in
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]

// Non-newline whitespace, including non-breaking spaces. Used for magic links.
space_or_nbsp
  = space // includes \t
  / unispace
  / & "&" he:htmlentity
    &{ return is_array( $he ) && $he[ 1 ] === "\u{A0}"; }
    { return $he; }

// Used within ISBN magic links
space_or_nbsp_or_dash = space_or_nbsp / "-"

// Elements that do not break beginning or end of line for blocks (headers for instance)
sol_transparent = comment / include_limits / annotation_tag / behavior_switch

sol = (empty_line_with_comments / sol_prefix) sol_transparent*

sol_prefix
  = newlineToken
  / & {
      // Use the sol flag only at the start of the input
      // Flag should always be an actual boolean (not falsy or undefined)
      $this->assert( is_bool( $this->options['sol'] ), 'sol should be boolean' );
      return $this->endOffset() === 0 && $this->options['sol'];
  } { return []; }

// This rule requires at least one comment to be matched
empty_line_with_comments
  = sp:sol_prefix p:("" { return $this->endOffset(); })
    c:(space* comment (space / comment)* newline)+ {
      $dp = new DataParsoid;
      $dp->tsr = new SourceRange( $p, $this->endOffset() );
      $dp->tokens = TokenizerUtils::flattenIfArray( $c );
      return [
        $sp,
        new SelfclosingTagTk( 'meta', [ new KV( 'typeof', 'mw:EmptyLine' ) ], $dp )
      ];
    }
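
/* Worked example for empty_line_with_comments (an illustrative approximation,
 * not captured generated output): for the input "\n  <!-- note -->\n",
 * sol_prefix matches the leading newline and yields a NlTk, after which
 * empty_line_with_comments consumes the comment-only line and returns roughly
 *   [ [ NlTk ], SelfclosingTagTk( 'meta', [ KV( 'typeof', 'mw:EmptyLine' ) ], $dp ) ]
 * where $dp->tokens holds the leading spaces, the CommentTk for the comment
 * and the trailing newline, and $dp->tsr covers the comment-only line up to
 * and including that newline.
 */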