How to Convert RTF to HTML

Asked

Viewed 1,921 times

5

I’m working on a web system that pulls a bulletin board from a system made in Delphi, which uses the format RTF to format the text. With this I need to use some Server-Side class in PHP to transform it into HTML.

The class I’m trying to be rtfclass.php that stays here: rtfclass Github.

I couldn’t find documentation with examples to test it accurately.

Issues:

  1. What is the best RTF/HTML conversion solution?
  2. Could someone give me an example of the use?

1 answer

3


Ever tried to use that Parser?

Include the archive rtf.php somewhere in your project. Then do the following:

$reader = new RtfReader();
$rtf = file_get_contents("test.rtf"); // Ou uma string
$reader->Parse($rtf);

If you want to see what is being analyzed do:

$reader->root->dump();

To perform the conversion to HTML:

$formatter = new RtfHtml();
echo $formatter->Format($reader->root);

To save use some function that writes this content in a file, such as file_put_contents.

Follows as an example:

$html = $formatter->Format($reader->root);
file_put_contents('test.html', $html);

The archive rtf.php:

/**
   * RTF parser/formatter
   *
   * This code reads RTF files and formats the RTF data to HTML.
   *
   * PHP version 5
   *
   * @author     Alexander van Oostenrijk
   * @copyright  2014 Alexander van Oostenrijk
   * @license    GNU
   * @version    1
   * @link       http://www.websofia.com
   * 
   * Sample of use:
   * 
   * $reader = new RtfReader();
   * $rtf = file_get_contents("itc.rtf"); // or use a string
   * $reader->Parse($rtf);
   * //$reader->root->dump(); // to see what the reader read
   * $formatter = new RtfHtml();
   * echo $formatter->Format($reader->root);   
   */

  class RtfElement
  {
    protected function Indent($level)
    {
      for($i = 0; $i < $level * 2; $i++) echo "&nbsp;";
    }
  }

  class RtfGroup extends RtfElement
  {
    public $parent;
    public $children;

    public function __construct()
    {
      $this->parent = null;
      $this->children = array();
    }

    public function GetType()
    {
      // No children?
      if(sizeof($this->children) == 0) return null;
      // First child not a control word?
      $child = $this->children[0];
      if(get_class($child) != "RtfControlWord") return null;
      return $child->word;
    }    

    public function IsDestination()
    {
      // No children?
      if(sizeof($this->children) == 0) return null;
      // First child not a control symbol?
      $child = $this->children[0];
      if(get_class($child) != "RtfControlSymbol") return null;
      return $child->symbol == '*';
    }

    public function dump($level = 0)
    {
      echo "<div>";
      $this->Indent($level);
      echo "{";
      echo "</div>";

      foreach($this->children as $child)
      {
        if(get_class($child) == "RtfGroup")
        {
          if ($child->GetType() == "fonttbl") continue;
          if ($child->GetType() == "colortbl") continue;
          if ($child->GetType() == "stylesheet") continue;
          if ($child->GetType() == "info") continue;
          // Skip any pictures:
          if (substr($child->GetType(), 0, 4) == "pict") continue;
          if ($child->IsDestination()) continue;
        }
        $child->dump($level + 2);
      }

      echo "<div>";
      $this->Indent($level);
      echo "}";
      echo "</div>";
    }
  }

  class RtfControlWord extends RtfElement
  {
    public $word;
    public $parameter;

    public function dump($level)
    {
      echo "<div style='color:green'>";
      $this->Indent($level);
      echo "WORD {$this->word} ({$this->parameter})";
      echo "</div>";
    }
  }

  class RtfControlSymbol extends RtfElement
  {
    public $symbol;
    public $parameter = 0;

    public function dump($level)
    {
      echo "<div style='color:blue'>";
      $this->Indent($level);
      echo "SYMBOL {$this->symbol} ({$this->parameter})";
      echo "</div>";
    }    
  }

  class RtfText extends RtfElement
  {
    public $text;

    public function dump($level)
    {
      echo "<div style='color:red'>";
      $this->Indent($level);
      echo "TEXT {$this->text}";
      echo "</div>";
    }    
  }

  class RtfReader
  {
    public $root = null;

    protected function GetChar()
    {
      $this->char = $this->rtf[$this->pos++];
    }

    protected function ParseStartGroup()
    {
      // Store state of document on stack.
      $group = new RtfGroup();
      if($this->group != null) $group->parent = $this->group;
      if($this->root == null)
      {
        $this->group = $group;
        $this->root = $group;
      }
      else
      {
        array_push($this->group->children, $group);
        $this->group = $group;
      }
    }

    protected function is_letter()
    {
      if(ord($this->char) >= 65 && ord($this->char) <= 90) return TRUE;
      if(ord($this->char) >= 90 && ord($this->char) <= 122) return TRUE;
      return FALSE;
    }

    protected function is_digit()
    {
      if(ord($this->char) >= 48 && ord($this->char) <= 57) return TRUE;
      return FALSE;
    }

    protected function ParseEndGroup()
    {
      // Retrieve state of document from stack.
      $this->group = $this->group->parent;
    }

    protected function ParseControlWord()
    {
      $this->GetChar();
      $word = "";
      while($this->is_letter())
      {
        $word .= $this->char;
        $this->GetChar();
      }

      // Read parameter (if any) consisting of digits.
      // Paramater may be negative.
      $parameter = null;
      $negative = false;
      if($this->char == '-') 
      {
        $this->GetChar();
        $negative = true;
      }
      while($this->is_digit())
      {
        if($parameter == null) $parameter = 0;
        $parameter = $parameter * 10 + $this->char;
        $this->GetChar();
      }
      if($parameter === null) $parameter = 1;
      if($negative) $parameter = -$parameter;

      // If this is \u, then the parameter will be followed by 
      // a character.
      if($word == "u") 
      {
      }
      // If the current character is a space, then
      // it is a delimiter. It is consumed.
      // If it's not a space, then it's part of the next
      // item in the text, so put the character back.
      else
      {
        if($this->char != ' ') $this->pos--; 
      }

      $rtfword = new RtfControlWord();
      $rtfword->word = $word;
      $rtfword->parameter = $parameter;
      array_push($this->group->children, $rtfword);
    }

    protected function ParseControlSymbol()
    {
      // Read symbol (one character only).
      $this->GetChar();
      $symbol = $this->char;

      // Symbols ordinarily have no parameter. However, 
      // if this is \', then it is followed by a 2-digit hex-code:
      $parameter = 0;
      if($symbol == '\'')
      {
        $this->GetChar(); 
        $parameter = $this->char;
        $this->GetChar(); 
        $parameter = hexdec($parameter . $this->char);
      }

      $rtfsymbol = new RtfControlSymbol();
      $rtfsymbol->symbol = $symbol;
      $rtfsymbol->parameter = $parameter;
      array_push($this->group->children, $rtfsymbol);
    }

    protected function ParseControl()
    {
      // Beginning of an RTF control word or control symbol.
      // Look ahead by one character to see if it starts with
      // a letter (control world) or another symbol (control symbol):
      $this->GetChar();
      $this->pos--;
      if($this->is_letter()) 
        $this->ParseControlWord();
      else
        $this->ParseControlSymbol();
    }

    protected function ParseText()
    {
      // Parse plain text up to backslash or brace,
      // unless escaped.
      $text = "";

      do
      {
        $terminate = false;
        $escape = false;

        // Is this an escape?
        if($this->char == '\\')
        {
          // Perform lookahead to see if this
          // is really an escape sequence.
          $this->GetChar();
          switch($this->char)
          {
            case '\\': $text .= '\\'; break;
            case '{': $text .= '{'; break;
            case '}': $text .= '}'; break;
            default:
              // Not an escape. Roll back.
              $this->pos = $this->pos - 2;
              $terminate = true;
              break;
          }
        }
        else if($this->char == '{' || $this->char == '}')
        {
          $this->pos--;
          $terminate = true;
        }

        if(!$terminate && !$escape)
        {
          $text .= $this->char;
          $this->GetChar();
        }
      }
      while(!$terminate && $this->pos < $this->len);

      $rtftext = new RtfText();
      $rtftext->text = $text;
      array_push($this->group->children, $rtftext);
    }

    public function Parse($rtf)
    {
      $this->rtf = $rtf;
      $this->pos = 0;
      $this->len = strlen($this->rtf);
      $this->group = null;
      $this->root = null;

      while($this->pos < $this->len)
      {
        // Read next character:
        $this->GetChar();

        // Ignore \r and \n
        if($this->char == "\n" || $this->char == "\r") continue;

        // What type of character is this?
        switch($this->char)
        {
          case '{':
            $this->ParseStartGroup();
            break;
          case '}':
            $this->ParseEndGroup();
            break;
          case '\\':
            $this->ParseControl();
            break;
          default:
            $this->ParseText();
            break;
        }
      }
    }
  }

  class RtfState
  {
    public function __construct()
    {
      $this->Reset();
    }

    public function Reset()
    {
      $this->bold = false;
      $this->italic = false;
      $this->underline = false;
      $this->end_underline = false;
      $this->strike = false;
      $this->hidden = false;
      $this->fontsize = 0;
    }
  }

  class RtfHtml
  {
    public function Format($root)
    {
      $this->output = "";
      // Create a stack of states:
      $this->states = array();
      // Put an initial standard state onto the stack:
      $this->state = new RtfState();
      array_push($this->states, $this->state);
      $this->FormatGroup($root);
      return $this->output;
    }

    protected function FormatGroup($group)
    {
      // Can we ignore this group?
      if ($group->GetType() == "fonttbl") return;
      if ($group->GetType() == "colortbl") return;
      if ($group->GetType() == "stylesheet") return;
      if ($group->GetType() == "info") return;
      // Skip any pictures:
      if (substr($group->GetType(), 0, 4) == "pict") return;
      if ($group->IsDestination()) return;

      // Push a new state onto the stack:
      $this->state = clone $this->state;
      array_push($this->states, $this->state);

      foreach($group->children as $child)
      {
        if(get_class($child) == "RtfGroup") $this->FormatGroup($child);
        if(get_class($child) == "RtfControlWord") $this->FormatControlWord($child);
        if(get_class($child) == "RtfControlSymbol") $this->FormatControlSymbol($child);
        if(get_class($child) == "RtfText") $this->FormatText($child);
      }

      // Pop state from stack.
      array_pop($this->states);
      $this->state = $this->states[sizeof($this->states)-1];
    }

    protected function FormatControlWord($word)
    {
      if($word->word == "plain") $this->state->Reset();
      if($word->word == "b") $this->state->bold = $word->parameter;
      if($word->word == "i") $this->state->italic = $word->parameter;
      if($word->word == "ul") $this->state->underline = $word->parameter;
      if($word->word == "ulnone") $this->state->end_underline = $word->parameter;
      if($word->word == "strike") $this->state->strike = $word->parameter;
      if($word->word == "v") $this->state->hidden = $word->parameter;
      if($word->word == "fs") $this->state->fontsize = ceil(($word->parameter / 24) * 16);

      if($word->word == "par") $this->output .= "<p>";

      // Characters:
      if($word->word == "lquote") $this->output .= "&lsquo;";
      if($word->word == "rquote") $this->output .= "&rsquo;";
      if($word->word == "ldblquote") $this->output .= "&ldquo;";
      if($word->word == "rdblquote") $this->output .= "&rdquo;";
      if($word->word == "emdash") $this->output .= "&mdash;";
      if($word->word == "endash") $this->output .= "&ndash;";
      if($word->word == "bullet") $this->output .= "&bull;";
      if($word->word == "u") $this->output .= "&loz;";
    }

    protected function BeginState()
    {
      $span = "";
      if($this->state->bold) $span .= "font-weight:bold;";
      if($this->state->italic) $span .= "font-style:italic;";
      if($this->state->underline) $span .= "text-decoration:underline;";
      if($this->state->end_underline) $span .= "text-decoration:none;";
      if($this->state->strike) $span .= "text-decoration:strikethrough;";
      if($this->state->hidden) $span .= "display:none;";
      if($this->state->fontsize != 0) $span .= "font-size: {$this->state->fontsize}px;";
      $this->output .= "<span style='{$span}'>";
    }

    protected function EndState()
    {
      $this->output .= "</span>";
    }

    protected function FormatControlSymbol($symbol)
    {
      if($symbol->symbol == '\'')
      {
        $this->BeginState();
        $this->output .= htmlentities(chr($symbol->parameter), ENT_QUOTES, 'ISO-8859-1');
        $this->EndState();
      }
    }

    protected function FormatText($text)
    {
      $this->BeginState();
      $this->output .= $text->text;
      $this->EndState();
    }
  }

Updating

Before doing the Parsing check if the content to be converted indicates having the format .rtf, with a regular expression we can see if a sequence has any traces of this format:

$texto = '{\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Tahoma;}}';

if (preg_match("/(\{\})|}|(\\\S+)/", $texto) > 0) {
  // $texto parece ter o formato .rtf
} else {
  // $texto parece ser texto puro
}

If the text is in format .rtf you do the Parsing otherwise return the plain text.

  • I tested this solution, and it worked, but there is another case, it identifies if it is rtf and turns into html, okay, but if I send a plain text without rtf it is no information. turns out, for the Delphi that creates warnings in rtf I need to convert, but in the web system I create warnings in plain text

  • @Gabrielrodrigues In this case you would have adapted the function Parse to answer that. How are you calling Parse?

  • I leave the class in a rtf.php file of a include after this do a select in the table warnings and put the result in a variable that I put instead of the file and take the text in html with another variable and show on the screen

  • To post the code only tomorrow because I’m no longer in service :(

  • @Gabrielrodrigues I updated the answer, before doing the parse see if the text has the format .rtf if there is no return of the plain text.

Browser other questions tagged

You are not signed in. Login or sign up in order to post.