MediaWiki extension: HtmlToWiki

InfoDabble > Tech Notes > MediaWiki > MediaWiki extension: HtmlToWiki
Jump to: navigation, search
This project is still under development. See Writing a new special page
This is a MediaWiki Extension

Release status: beta

Description: HTML to wikitext converter
Author: Eric Hartwell
Version: 0.5 (2007.8.10)
MediaWiki: 1.10
Download: Code

Contents

[edit] What can this extension do?

This extension is yet another HTML to wikitext converter (Try it!). What's different about this one?

  • Completely written in PHP
  • Installs as a special page in your wiki
  • Uses HTML Purifier, a standards-compliant HTML filter library written in PHP, to parse almost any HTML input source
  • Converts a basic subset of HTML tags to MediaWiki wikitext format
    • Easy to modify handling of particular tags and attributes

For a more powerful online converter, use Diberri's Perl-based online HTML::WikiConverter page.

[edit] Usage

[edit] Installation

[edit] Parameters

[edit] Changes to LocalSettings.php

# Load the HtmlToWiki extension setup file from the extensions directory under $IP.
require_once( "$IP/extensions/HtmlToWiki/HtmlToWiki.php" );

[edit] Code

[edit] HtmlToWiki.php

<?php
# Not a valid entry point: unless MEDIAWIKI is defined, print the installation
# instructions and stop with a non-zero exit status.
if ( !defined( 'MEDIAWIKI' ) ) {
        # The dollar sign is escaped so that the literal text "$IP" is printed.
        # Unescaped, the heredoc would interpolate a variable that this file
        # never sets, producing a notice and a broken path in the instructions.
        echo <<<EOT
To install my extension, put the following line in LocalSettings.php:
require_once( "\$IP/extensions/HtmlToWiki/HtmlToWiki.php" );
EOT;
        exit( 1 );
}
 
# Autoload the special page class from the body file and register the page
# under the name 'HtmlToWiki'.
$wgAutoloadClasses['HtmlToWiki'] = dirname(__FILE__) . '/HtmlToWiki_body.php';
$wgSpecialPages['HtmlToWiki'] = 'HtmlToWiki';

# Hook handlers. The alias hook must be spelled 'LanguageGetSpecialPageAliases';
# a misspelled hook name is stored without error but is never run, so the
# localized page name would silently never be registered.
$wgHooks['LoadAllMessages'][] = 'HtmlToWiki::loadMessages';
$wgHooks['LanguageGetSpecialPageAliases'][] = 'HtmlToWikiLocalizedPageName';

# Credits entry for this special page extension.
$wgExtensionCredits['specialpage'][] = array(
        'name' => 'HtmlToWiki',
        'author' => 'Eric Hartwell',
        'url' => 'http://www.ehartwell.com',
        'description' => 'HTML to wikitext converter'
);
 
/**
 * Hook handler that adds the localized title of the special page to the
 * alias array, filed under the page's canonical name 'HtmlToWiki' (the same
 * key used in $wgSpecialPages).
 *
 * @param array $specialPageArray alias array, modified in place
 * @param mixed $code not used by this handler (presumably the language
 *                    code passed by the hook; confirm against the hook docs)
 * @return bool always true
 */
function HtmlToWikiLocalizedPageName(&$specialPageArray, $code) {
  # The localized title of the special page is among the messages of the
  # extension:
  HtmlToWiki::loadMessages();
  $text = wfMsg('htmltowiki');

  # Convert from title in text form to DBKey. Title::newFromText() yields no
  # object for text that is not a valid title, so only add an alias when the
  # conversion succeeded instead of calling a method on a non-object.
  $title = Title::newFromText($text);
  if ( is_object( $title ) ) {
    # Key by the canonical page name, not by the localized text, so the alias
    # is attached to the registered page whatever the message contains.
    $specialPageArray['HtmlToWiki'][] = $title->getDBkey();
  }

  return true;
}
 
?>

[edit] HtmlToWiki_body.php

<?php
/**
 * Special page that shows a form for pasting HTML and, when HTML has been
 * submitted, a second text area holding the converted wikitext.
 */
class HtmlToWiki extends SpecialPage
{
	/**
	 * Constructor: loads the extension messages and registers the page.
	 */
	function HtmlToWiki() {
		self::loadMessages();
		# Use the canonical page name (the key used in $wgSpecialPages) rather
		# than the localized message text, which may differ from it.
		SpecialPage::SpecialPage( 'HtmlToWiki' );
	}

	/**
	 * Output the input form and, if the request carries HTML in the
	 * 'convert' field, the converted wikitext.
	 *
	 * @param mixed $par not used by this page
	 */
	function execute( $par ) {
		global $wgRequest, $wgOut;

		$this->setHeaders();
		$self = Title::makeTitle( NS_SPECIAL, wfMsg('htmltowiki') );

		# HTML submitted through the form (empty on the first visit)
		$html = $wgRequest->getText('convert');
		$hasInput = ( $html != '' );

		# Build the input form using Xml.php (see: Special:LinkSearch).
		# The input box is shorter once there is output to show below it.
		$wgOut->addWikiText( wfMsg('htmltowiki-text') );
		$wgOut->addHtml(
			Xml::openElement( 'form', array( 'id' => 'mw-htmltowiki-form',
				'method' => 'post', 'action' => $self->escapeLocalURL() ) ) .
			xml_startfieldset( 'input', wfMsg( 'htmltowiki-input' ) ) .
			xml_textarea( 'convert', $hasInput ? 6 : 20, 100, $html ) .
			Xml::submitButton( wfMsg( 'htmltowiki-convert' ) ) .
			xml_endfieldset()
		);

		// Show the converted wikitext if there's something to convert
		if ( $hasInput ) {
			# Only load the HTML Purifier code if we're going to use it.
			# require_once guards against including the file a second time.
			# HTMLPurifierWikitext is presumably declared by that file - confirm.
			require_once( dirname( __FILE__ ) . '/HtmlToWikiPurifier.php' );

			$parser = new HTMLPurifierWikitext();
			$out = $parser->purifyWikitext( $html );
			$wgOut->addHtml(
				xml_startfieldset( 'output', wfMsg( 'htmltowiki-output' ) ) .
				xml_textarea( 'wikitext', 15, 100, $out ) .
				xml_endfieldset()
			);
		}

		// Finish off the input/output form
		$wgOut->addHtml( Xml::closeElement( 'form' ) );
		$wgOut->addWikiText( wfMsg('htmltowiki-credits') );
	}

	/**
	 * Add the messages from HtmlToWiki.i18n.php to the message cache.
	 * Only the first call does any work.
	 *
	 * Declared static because it is invoked as self::loadMessages(),
	 * HtmlToWiki::loadMessages() and through the 'HtmlToWiki::loadMessages'
	 * hook entry, none of which have an instance.
	 *
	 * @return bool always true, so it can serve as a hook handler
	 */
	static function loadMessages() {
		static $messagesLoaded = false;
		global $wgMessageCache;
		if ( $messagesLoaded ) return true;
		$messagesLoaded = true;

		# The included file defines $allMessages in this scope
		require( dirname( __FILE__ ) . '/HtmlToWiki.i18n.php' );
		foreach ( $allMessages as $lang => $langMessages ) {
			$wgMessageCache->addMessages( $langMessages, $lang );
		}
		return true;
	}
}
 
 
/**
 * Build a complete <textarea> element, opening tag through closing tag.
 *
 * The contents are HTML-escaped before being placed between the tags, so
 * text containing markup (including a literal closing textarea tag) is
 * displayed as text instead of being interpreted by the browser.
 *
 * @param string $name    value of the name attribute
 * @param mixed  $rows    value of the rows attribute
 * @param mixed  $cols    value of the cols attribute
 * @param mixed  $value   initial contents of the text area
 * @param array  $attribs additional attributes; because the arrays are joined
 *                        with +, a name/cols/rows entry here does not replace
 *                        the explicit arguments
 * @return string HTML
 */
 ## Note: <textarea value=xxx /> doesn't work with IE
function xml_textarea( $name, $rows=false, $cols=false, $value=false, $attribs=array() ) {
	$attributes = array( 'name' => $name, 'cols' => $cols, 'rows' => $rows ) + $attribs;

	return Xml::openElement( 'textarea', $attributes )
		. htmlspecialchars( (string)$value )
		. Xml::closeElement( 'textarea' );
}
 
/**
 * Open a <fieldset>, optionally followed by a <legend> holding the label.
 *
 * @param mixed $id    value of the id attribute; when not given (or empty)
 *                     no id attribute is written, instead of an empty id=""
 * @param mixed $label legend text; no legend is written when not given
 * @return string HTML
 */
function xml_startfieldset( $id=false, $label=false ) {
	$attribs = array();
	if ( strval( $id ) !== '' ) {
		$attribs['id'] = $id;
	}

	$out = Xml::openElement( 'fieldset', $attribs );
	if ( strval( $label ) !== '' ) {
		$out .= Xml::element( 'legend', array(), $label );
	}
	return $out;
}
 
/**
 * Close a <fieldset> element.
 *
 * @return string HTML
 */
function xml_endfieldset() {
	$closingTag = Xml::closeElement( 'fieldset' );
	return $closingTag;
}
 
?>

[edit] HtmlToWiki.i18n.php

<?php
# Interface messages of the extension, keyed by language code and then by
# message name. This file is included by HtmlToWiki::loadMessages().
$allMessages = array(
	'en' => array(
		# Page name
		'htmltowiki' => 'HtmlToWiki',
		# Introductory text above the form
		'htmltowiki-text' => 'Convert HTML to wikitext:',
		# Submit button
		'htmltowiki-convert' => 'Convert to wikitext',
		# Legends of the input and output field sets
		'htmltowiki-input' => 'HTML source',
		'htmltowiki-output' => 'Converted wikitext',
		# Footer text below the form
		'htmltowiki-credits' => 'For more information about this utility, see [http://www.ehartwell.com/InfoDabble/MediaWiki_extension:_HtmlToWiki HtmlToWiki]. This utility uses the [http://htmlpurifier.org/ HTML Purifier] library, which is licensed under the LGPL v2.1+.',
	),
);
?>

[edit] HtmlToWikiPurifier.php

<?php
/**
 * Special page that shows a form for pasting HTML and, when HTML has been
 * submitted, a second text area holding the converted wikitext.
 *
 * NOTE(review): this listing appears under the HtmlToWikiPurifier.php heading
 * but declares the same HtmlToWiki class as the HtmlToWiki_body.php listing,
 * and does not declare the HTMLPurifierWikitext class that execute()
 * instantiates. It looks like the wrong file was pasted here - confirm
 * against the downloadable code.
 */
class HtmlToWiki extends SpecialPage
{
	/**
	 * Constructor: loads the extension messages and registers the page.
	 */
	function HtmlToWiki() {
		self::loadMessages();
		# Use the canonical page name (the key used in $wgSpecialPages) rather
		# than the localized message text, which may differ from it.
		SpecialPage::SpecialPage( 'HtmlToWiki' );
	}

	/**
	 * Output the input form and, if the request carries HTML in the
	 * 'convert' field, the converted wikitext.
	 *
	 * @param mixed $par not used by this page
	 */
	function execute( $par ) {
		global $wgRequest, $wgOut;

		$this->setHeaders();
		$self = Title::makeTitle( NS_SPECIAL, wfMsg('htmltowiki') );

		# HTML submitted through the form (empty on the first visit)
		$html = $wgRequest->getText('convert');
		$hasInput = ( $html != '' );

		# Build the input form using Xml.php (see: Special:LinkSearch).
		# The input box is shorter once there is output to show below it.
		$wgOut->addWikiText( wfMsg('htmltowiki-text') );
		$wgOut->addHtml(
			Xml::openElement( 'form', array( 'id' => 'mw-htmltowiki-form',
				'method' => 'post', 'action' => $self->escapeLocalURL() ) ) .
			xml_startfieldset( 'input', wfMsg( 'htmltowiki-input' ) ) .
			xml_textarea( 'convert', $hasInput ? 6 : 20, 100, $html ) .
			Xml::submitButton( wfMsg( 'htmltowiki-convert' ) ) .
			xml_endfieldset()
		);

		// Show the converted wikitext if there's something to convert
		if ( $hasInput ) {
			# Only load the HTML Purifier code if we're going to use it.
			# require_once guards against including the file a second time.
			# HTMLPurifierWikitext is presumably declared by that file - confirm.
			require_once( dirname( __FILE__ ) . '/HtmlToWikiPurifier.php' );

			$parser = new HTMLPurifierWikitext();
			$out = $parser->purifyWikitext( $html );
			$wgOut->addHtml(
				xml_startfieldset( 'output', wfMsg( 'htmltowiki-output' ) ) .
				xml_textarea( 'wikitext', 15, 100, $out ) .
				xml_endfieldset()
			);
		}

		// Finish off the input/output form
		$wgOut->addHtml( Xml::closeElement( 'form' ) );
		$wgOut->addWikiText( wfMsg('htmltowiki-credits') );
	}

	/**
	 * Add the messages from HtmlToWiki.i18n.php to the message cache.
	 * Only the first call does any work.
	 *
	 * Declared static because it is invoked as self::loadMessages(), which
	 * has no instance to call it on.
	 *
	 * @return bool always true, so it can serve as a hook handler
	 */
	static function loadMessages() {
		static $messagesLoaded = false;
		global $wgMessageCache;
		if ( $messagesLoaded ) return true;
		$messagesLoaded = true;

		# The included file is expected to define $allMessages in this scope
		require( dirname( __FILE__ ) . '/HtmlToWiki.i18n.php' );
		foreach ( $allMessages as $lang => $langMessages ) {
			$wgMessageCache->addMessages( $langMessages, $lang );
		}
		return true;
	}
}
 
 
/**
 * Build a complete <textarea> element, opening tag through closing tag.
 *
 * The contents are HTML-escaped before being placed between the tags, so
 * text containing markup (including a literal closing textarea tag) is
 * displayed as text instead of being interpreted by the browser.
 *
 * @param string $name    value of the name attribute
 * @param mixed  $rows    value of the rows attribute
 * @param mixed  $cols    value of the cols attribute
 * @param mixed  $value   initial contents of the text area
 * @param array  $attribs additional attributes; because the arrays are joined
 *                        with +, a name/cols/rows entry here does not replace
 *                        the explicit arguments
 * @return string HTML
 */
 ## Note: <textarea value=xxx /> doesn't work with IE
function xml_textarea( $name, $rows=false, $cols=false, $value=false, $attribs=array() ) {
	$attributes = array( 'name' => $name, 'cols' => $cols, 'rows' => $rows ) + $attribs;

	return Xml::openElement( 'textarea', $attributes )
		. htmlspecialchars( (string)$value )
		. Xml::closeElement( 'textarea' );
}
 
/**
 * Open a <fieldset>, optionally followed by a <legend> holding the label.
 *
 * @param mixed $id    value of the id attribute; when not given (or empty)
 *                     no id attribute is written, instead of an empty id=""
 * @param mixed $label legend text; no legend is written when not given
 * @return string HTML
 */
function xml_startfieldset( $id=false, $label=false ) {
	$attribs = array();
	if ( strval( $id ) !== '' ) {
		$attribs['id'] = $id;
	}

	$out = Xml::openElement( 'fieldset', $attribs );
	if ( strval( $label ) !== '' ) {
		$out .= Xml::element( 'legend', array(), $label );
	}
	return $out;
}
 
/**
 * Close a <fieldset> element.
 *
 * @return string HTML
 */
function xml_endfieldset() {
	$closingTag = Xml::closeElement( 'fieldset' );
	return $closingTag;
}
 
?>

[edit] Implementation

Surprisingly enough, it's hard to find a current HTML parser written in PHP.

Finally I found HTML Purifier, which bills itself as a standards-compliant HTML filter library.

[edit] Tips, tricks, and hoops

  • Initially I hoped to use PHP's DOM functions, but they are only suitable for use with clean XML. For example, they don't tokenize common nested tags such as:
    <p>This is <b>typical</b> HTML markup.</p>
  • "[Y]ou save my day by allowing me not to write another damned HTML parser."

[edit] Version history

  • August 10, 2007 - initial version

[edit] See also

Other HTML parsers and converters: