MediaWiki extension: HtmlToWiki
| Release status: beta | |
|---|---|
| Description: | HTML to wikitext converter |
| Author: | Eric Hartwell |
| Version: | 0.5 (2007.8.10) |
| MediaWiki: | 1.10 |
| Download: | Code |
|
[edit] What can this extension do?
This extension is yet another HTML to wikitext converter (Try it!). What's different about this one?
- Completely written in PHP
- Installs as a special page in your wiki
- Uses HTML Purifier, a standards-compliant HTML filter library written in PHP, to parse almost any HTML input source
- Converts a basic subset of HTML tags to MediaWiki wikitext format
- Easy to modify handling of particular tags and attributes
For a more powerful online converter, use Diberri's Perl-based online HTML::WikiConverter page.
[edit] Usage
[edit] Installation
[edit] Parameters
[edit] Changes to LocalSettings.php
require_once("$IP/extensions/HtmlToWiki/HtmlToWiki.php");
[edit] Code
[edit] HtmlToWiki.php
<?php
/**
 * HtmlToWiki - setup file.
 *
 * Registers the HtmlToWiki special page, its autoloaded class, its message
 * loader and its extension credits. Include it from LocalSettings.php.
 */

# Not a valid entry point, skip unless MEDIAWIKI is defined
if ( !defined( 'MEDIAWIKI' ) ) {
	# "$IP" is escaped so the hint prints it literally; unescaped, the heredoc
	# would interpolate the (undefined) variable and drop it from the message.
	echo <<<EOT
To install my extension, put the following line in LocalSettings.php:
require_once( "\$IP/extensions/HtmlToWiki/HtmlToWiki.php" );
EOT;
	exit( 1 );
}

# The class itself lives in HtmlToWiki_body.php and is loaded on demand.
$wgAutoloadClasses['HtmlToWiki'] = dirname( __FILE__ ) . '/HtmlToWiki_body.php';
$wgSpecialPages['HtmlToWiki'] = 'HtmlToWiki';

$wgHooks['LoadAllMessages'][] = 'HtmlToWiki::loadMessages';
# Hook name fixed: it was misspelled "LangugeGetSpecialPageAliases", so the
# handler below was never called.
$wgHooks['LanguageGetSpecialPageAliases'][] = 'HtmlToWikiLocalizedPageName';

$wgExtensionCredits['specialpage'][] = array(
	'name'        => 'HtmlToWiki',
	'author'      => 'Eric Hartwell',
	'url'         => 'http://www.ehartwell.com',
	'description' => 'HTML to wikitext converter',
);

/**
 * Hook handler: adds the localized title of the special page as an alias.
 *
 * @param array  $specialPageArray Alias lists, modified in place
 * @param string $code             Language code (unused here)
 * @return bool Always true
 */
function HtmlToWikiLocalizedPageName( &$specialPageArray, $code ) {
	# The localized title of the special page is among the messages of the
	# extension:
	HtmlToWiki::loadMessages();
	$text = wfMsg( 'htmltowiki' );

	# Convert from title in text form to DBKey and put it into the alias array.
	# The array is keyed by the name the page is registered under in
	# $wgSpecialPages ('HtmlToWiki'), not by the localized message text.
	$title = Title::newFromText( $text );
	if ( is_object( $title ) ) {
		# NOTE(review): guards against newFromText() yielding no Title for an
		# unusable message text - confirm against the MediaWiki version in use.
		$specialPageArray['HtmlToWiki'][] = $title->getDBKey();
	}

	return true;
}
?>
[edit] HtmlToWiki_body.php
<?php
/**
 * Special page offering a form that converts pasted HTML into wikitext.
 */
class HtmlToWiki extends SpecialPage {

	/**
	 * Constructor: loads the extension messages, then registers the page
	 * under the name given by the 'htmltowiki' message.
	 */
	function HtmlToWiki() {
		self::loadMessages();
		SpecialPage::SpecialPage( wfMsg( 'htmltowiki' ) );
	}

	/**
	 * Renders the input form and, when HTML was submitted in the 'convert'
	 * request field, the converted wikitext below it.
	 *
	 * @param mixed $par Subpage parameter (unused)
	 */
	function execute( $par ) {
		global $wgRequest, $wgOut;

		$this->setHeaders();
		$self = Title::makeTitle( NS_SPECIAL, wfMsg( 'htmltowiki' ) );

		# Get request data from, e.g. $param = $wgRequest->getText('param');
		$html = $wgRequest->getText( 'convert' );
		$hasInput = ( $html != '' );

		# Build the input form using Xml.php (see: Special:LinkSearch)
		$wgOut->addWikiText( wfMsg( 'htmltowiki-text' ) );
		$wgOut->addHtml(
			Xml::openElement( 'form', array(
				'id'     => 'mw-htmltowiki-form',
				'method' => 'post',
				'action' => $self->escapeLocalURL(),
			) ) .
			xml_startfieldset( 'input', wfMsg( 'htmltowiki-input' ) ) .
			# The input box shrinks once there is converted output to show
			xml_textarea( 'convert', $hasInput ? 6 : 20, 100, $html ) .
			Xml::submitButton( wfMsg( 'htmltowiki-convert' ) ) .
			xml_endfieldset()
		);

		// Show the converted wikitext if there's something to convert
		if ( $hasInput ) {
			# Only load the HTML Purifier code if we're going to use it
			require_once( dirname( __FILE__ ) . '/HtmlToWikiPurifier.php' );

			$parser = new HTMLPurifierWikitext();
			$out = $parser->purifyWikitext( $html );
			$wgOut->addHtml(
				xml_startfieldset( 'output', wfMsg( 'htmltowiki-output' ) ) .
				xml_textarea( 'wikitext', 15, 100, $out ) .
				xml_endfieldset()
			);
		}

		// Finish off the input/output form
		$wgOut->addHtml( Xml::closeElement( 'form' ) );
		$wgOut->addWikiText( wfMsg( 'htmltowiki-credits' ) );
	}

	/**
	 * Adds the messages from HtmlToWiki.i18n.php to the message cache.
	 * Only the first call does any work.
	 *
	 * Declared static because it is called as self::loadMessages() and
	 * registered as the callback 'HtmlToWiki::loadMessages'.
	 *
	 * @return bool Always true, so the method is usable as a hook handler
	 */
	static function loadMessages() {
		static $messagesLoaded = false;
		global $wgMessageCache;

		if ( $messagesLoaded ) {
			return true;
		}
		$messagesLoaded = true;

		# The included file defines $allMessages as language => messages
		require( dirname( __FILE__ ) . '/HtmlToWiki.i18n.php' );
		foreach ( $allMessages as $lang => $langMessages ) {
			$wgMessageCache->addMessages( $langMessages, $lang );
		}
		return true;
	}
}

/**
 * Convenience function to build an HTML textarea with its content.
 *
 * The content is HTML-escaped: it comes from the request (or is derived from
 * it), so writing it out raw would let a submitted "</textarea>" inject
 * markup or script into the page.
 *
 * @param string     $name    Field name
 * @param int|bool   $rows    Value for the rows attribute
 * @param int|bool   $cols    Value for the cols attribute
 * @param string|bool $value  Text shown inside the textarea
 * @param array      $attribs Extra attributes; name/cols/rows above take precedence
 * @return string HTML
 */
## Note: <textarea value=xxx /> doesn't work with IE
function xml_textarea( $name, $rows=false, $cols=false, $value=false, $attribs=array() ) {
	$attributes = array( 'name' => $name, 'cols' => $cols, 'rows' => $rows ) + $attribs;

	return Xml::openElement( 'textarea', $attributes )
		. htmlspecialchars( (string)$value )
		. Xml::closeElement( 'textarea' );
}

/**
 * Opens a fieldset, optionally followed by a legend.
 *
 * @param string|bool $id    Value for the id attribute
 * @param string|bool $label Legend text; no legend is emitted when falsy
 * @return string HTML
 */
function xml_startfieldset( $id=false, $label=false ) {
	$out = Xml::openElement( 'fieldset', array( 'id' => $id ) );
	if ( $label ) {
		$out .= Xml::element( 'legend', array(), $label );
	}
	return $out;
}

/**
 * Closes a fieldset opened with xml_startfieldset().
 *
 * @return string HTML
 */
function xml_endfieldset() {
	return Xml::closeElement( 'fieldset' );
}
?>
[edit] HtmlToWiki.i18n.php
<?php
/**
 * Interface messages for the HtmlToWiki extension.
 *
 * $allMessages maps a language code to an array of message name => text.
 */
$allMessages = array();

# English
$allMessages['en'] = array();
$allMessages['en']['htmltowiki']         = 'HtmlToWiki';
$allMessages['en']['htmltowiki-text']    = 'Convert HTML to wikitext:';
$allMessages['en']['htmltowiki-convert'] = 'Convert to wikitext';
$allMessages['en']['htmltowiki-input']   = 'HTML source';
$allMessages['en']['htmltowiki-output']  = 'Converted wikitext';
$allMessages['en']['htmltowiki-credits'] = 'For more information about this utility, see [http://www.ehartwell.com/InfoDabble/MediaWiki_extension:_HtmlToWiki HtmlToWiki]. This utility uses the [http://htmlpurifier.org/ HTML Purifier] library, which is licensed under the LGPL v2.1+.';
?>
[edit] HtmlToWikiPurifier.php
<?php
# NOTE(review): this listing appears under the heading HtmlToWikiPurifier.php,
# but its content is the HtmlToWiki special page class, identical to the
# HtmlToWiki_body.php listing. It defines no HTMLPurifierWikitext class, which
# execute() instantiates, so the real purifier source seems to be missing from
# the page. Loading this code next to HtmlToWiki_body.php would declare the
# class HtmlToWiki twice - replace it with the actual purifier file.

/**
 * Special page offering a form that converts pasted HTML into wikitext.
 */
class HtmlToWiki extends SpecialPage {

	/**
	 * Constructor: loads the extension messages, then registers the page
	 * under the name given by the 'htmltowiki' message.
	 */
	function HtmlToWiki() {
		self::loadMessages();
		SpecialPage::SpecialPage( wfMsg( 'htmltowiki' ) );
	}

	/**
	 * Renders the input form and, when HTML was submitted in the 'convert'
	 * request field, the converted wikitext below it.
	 *
	 * @param mixed $par Subpage parameter (unused)
	 */
	function execute( $par ) {
		global $wgRequest, $wgOut;

		$this->setHeaders();
		$self = Title::makeTitle( NS_SPECIAL, wfMsg( 'htmltowiki' ) );

		# Get request data from, e.g. $param = $wgRequest->getText('param');
		$html = $wgRequest->getText( 'convert' );
		$hasInput = ( $html != '' );

		# Build the input form using Xml.php (see: Special:LinkSearch)
		$wgOut->addWikiText( wfMsg( 'htmltowiki-text' ) );
		$wgOut->addHtml(
			Xml::openElement( 'form', array(
				'id'     => 'mw-htmltowiki-form',
				'method' => 'post',
				'action' => $self->escapeLocalURL(),
			) ) .
			xml_startfieldset( 'input', wfMsg( 'htmltowiki-input' ) ) .
			# The input box shrinks once there is converted output to show
			xml_textarea( 'convert', $hasInput ? 6 : 20, 100, $html ) .
			Xml::submitButton( wfMsg( 'htmltowiki-convert' ) ) .
			xml_endfieldset()
		);

		// Show the converted wikitext if there's something to convert
		if ( $hasInput ) {
			# Only load the HTML Purifier code if we're going to use it
			require_once( dirname( __FILE__ ) . '/HtmlToWikiPurifier.php' );

			$parser = new HTMLPurifierWikitext();
			$out = $parser->purifyWikitext( $html );
			$wgOut->addHtml(
				xml_startfieldset( 'output', wfMsg( 'htmltowiki-output' ) ) .
				xml_textarea( 'wikitext', 15, 100, $out ) .
				xml_endfieldset()
			);
		}

		// Finish off the input/output form
		$wgOut->addHtml( Xml::closeElement( 'form' ) );
		$wgOut->addWikiText( wfMsg( 'htmltowiki-credits' ) );
	}

	/**
	 * Adds the messages from HtmlToWiki.i18n.php to the message cache.
	 * Only the first call does any work.
	 *
	 * Declared static because it is called as self::loadMessages().
	 *
	 * @return bool Always true, so the method is usable as a hook handler
	 */
	static function loadMessages() {
		static $messagesLoaded = false;
		global $wgMessageCache;

		if ( $messagesLoaded ) {
			return true;
		}
		$messagesLoaded = true;

		# The included file defines $allMessages as language => messages
		require( dirname( __FILE__ ) . '/HtmlToWiki.i18n.php' );
		foreach ( $allMessages as $lang => $langMessages ) {
			$wgMessageCache->addMessages( $langMessages, $lang );
		}
		return true;
	}
}

/**
 * Convenience function to build an HTML textarea with its content.
 *
 * The content is HTML-escaped: it comes from the request (or is derived from
 * it), so writing it out raw would let a submitted "</textarea>" inject
 * markup or script into the page.
 *
 * @param string     $name    Field name
 * @param int|bool   $rows    Value for the rows attribute
 * @param int|bool   $cols    Value for the cols attribute
 * @param string|bool $value  Text shown inside the textarea
 * @param array      $attribs Extra attributes; name/cols/rows above take precedence
 * @return string HTML
 */
## Note: <textarea value=xxx /> doesn't work with IE
function xml_textarea( $name, $rows=false, $cols=false, $value=false, $attribs=array() ) {
	$attributes = array( 'name' => $name, 'cols' => $cols, 'rows' => $rows ) + $attribs;

	return Xml::openElement( 'textarea', $attributes )
		. htmlspecialchars( (string)$value )
		. Xml::closeElement( 'textarea' );
}

/**
 * Opens a fieldset, optionally followed by a legend.
 *
 * @param string|bool $id    Value for the id attribute
 * @param string|bool $label Legend text; no legend is emitted when falsy
 * @return string HTML
 */
function xml_startfieldset( $id=false, $label=false ) {
	$out = Xml::openElement( 'fieldset', array( 'id' => $id ) );
	if ( $label ) {
		$out .= Xml::element( 'legend', array(), $label );
	}
	return $out;
}

/**
 * Closes a fieldset opened with xml_startfieldset().
 *
 * @return string HTML
 */
function xml_endfieldset() {
	return Xml::closeElement( 'fieldset' );
}
?>
[edit] Implementation
Surprisingly enough, it's hard to find a current HTML parser written in PHP.
Finally I found HTML Purifier, which bills itself as a standards-compliant HTML filter library.
[edit] Tips, tricks, and hoops
- Initially I hoped to use PHP's DOM functions, but they are only suitable for use with clean XML. For example, they don't tokenize common nested tags such as:
<p>This is <b>typical</b> HTML markup.</p> - "[Y]ou save my day by allowing me not to write another damned HTML parser."
[edit] Version history
- August 10, 2007 - initial version
[edit] See also
- For a more powerful online converter, use Diberri's Perl-based online HTML::WikiConverter page
- Send2Wiki
Other HTML parsers and converters:
- Magic HTML Parser (PHP Classes) Parse HTML documents and extract keywords
- HTML Parser Class (Totally PHP) HTML parser class that supports the innerHTML property
- DOM Functions (PHP.net) operate on XML documents through the DOM API with PHP 5.
- HTML table to wiki converter (DZone) to html python wiki conversion
- HTML Parser for PHP 4 (SourceForge) Object oriented PHP based HTML parser