Word HTML Cleaner

By Peter Bromberg
Access over 40 UI widgets with everything from interactive menus to rich charts.

This is the complete code for a Windows Forms app that uses Regex to load, clean, and re-save HTML documents saved from MS Word. Very handy! To keep it simple there is no exception handling, but if it is used correctly (pick a file in each dialog rather than cancelling) you should never see an exception. Just start a new Windows Application, clear out the Form1 code, and paste everything below into it!

using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.IO;
using System.Windows.Forms;
using System.Data;
using System.Text.RegularExpressions ;

namespace WordCleaner
{
/// <summary>
/// Single-button form that loads an HTML file saved from MS Word, removes
/// Word-specific markup with a regular expression, and saves the cleaned
/// document under a new name chosen through a save dialog.
/// </summary>
public class Form1 : System.Windows.Forms.Form
{
    // Markup removed from the document. The alternatives are, in order:
    //   - " class=name" attributes that are followed by a '>' before the next '<'
    //   - conditional comments  <!--[if ... <![endif]-->
    //   - <![if !name]> and <![endif]> markers
    //   - <o:p>...</o:p> elements that contain no nested tag
    //   - opening and closing <span> tags (the span content is kept)
    //   - font-family: / font-size: declarations, matched greedily up to the
    //     last ';' or ''' that precedes the next '>'
    // (?s) lets '.' match newlines so multi-line conditional comments are caught.
    private const string pat = @"(?s)( class=\w+(?=([^<]*>)))|(<!--\[if.*?<!\[endif\]-->)|(<!\[if !\w+\]>)|(<!\[endif\]>)|(<o:p>[^<]*</o:p>)|(<span[^>]*>)|(</span>)|(font-family:[^>]*[;'])|(font-size:[^>]*[;'])(?-s)";

    // Parsed once and reused for every document.
    private static readonly Regex wordMarkup = new Regex(pat);

    private System.Windows.Forms.OpenFileDialog openFileDialog1;
    private System.Windows.Forms.SaveFileDialog saveFileDialog1;
    private System.Windows.Forms.Button button1;
    private System.ComponentModel.Container components = null;

    /// <summary>Creates the form and builds its controls.</summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>Disposes the component container, if any, then the base form.</summary>
    /// <param name="disposing">True when called from Dispose() rather than a finalizer.</param>
    protected override void Dispose(bool disposing)
    {
        if (disposing && components != null)
        {
            components.Dispose();
        }
        base.Dispose(disposing);
    }

    #region Windows Form Designer generated code
    private void InitializeComponent()
    {
        this.openFileDialog1 = new System.Windows.Forms.OpenFileDialog();
        this.saveFileDialog1 = new System.Windows.Forms.SaveFileDialog();
        this.button1 = new System.Windows.Forms.Button();
        this.SuspendLayout();
        this.button1.Location = new System.Drawing.Point(16, 224);
        this.button1.Name = "button1";
        this.button1.Size = new System.Drawing.Size(128, 24);
        this.button1.TabIndex = 0;
        this.button1.Text = "Get Word Html";
        this.button1.Click += new System.EventHandler(this.button1_Click);
        this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
        this.ClientSize = new System.Drawing.Size(292, 266);
        this.Controls.Add(this.button1);
        this.Name = "Form1";
        this.Text = "Form1";
        this.ResumeLayout(false);
    }
    #endregion

    /// <summary>Application entry point: shows the form and runs the message loop.</summary>
    [STAThread]
    static void Main()
    {
        Application.Run(new Form1());
    }

    /// <summary>
    /// Removes every match of the Word-markup pattern from <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Document text as saved by Word.</param>
    /// <returns>The text with all matched markup deleted.</returns>
    internal static string CleanHtml(string html)
    {
        return wordMarkup.Replace(html, "");
    }

    /// <summary>
    /// Builds the suggested output name by inserting "_Fixed" in front of the
    /// file extension, e.g. "report.htm" becomes "report_Fixed.htm".
    /// </summary>
    /// <param name="path">Path of the file that was opened.</param>
    /// <returns>The path with "_Fixed" inserted before the extension, or
    /// appended when the file name has no extension.</returns>
    internal static string BuildFixedFileName(string path)
    {
        // Only the extension of the final path segment is touched; dots in
        // directory names or earlier in the file name must stay as they are.
        string extension = Path.GetExtension(path);
        string stem = path.Substring(0, path.Length - extension.Length);
        return stem + "_Fixed" + extension;
    }

    /// <summary>Reads <paramref name="stream"/> from its current position to the end.</summary>
    /// <param name="stream">Readable stream.</param>
    /// <returns>All bytes that were read.</returns>
    private static byte[] ReadToEnd(Stream stream)
    {
        // Stream.Read may return fewer bytes than requested, so keep reading
        // until it reports end of stream instead of trusting a single call.
        using (MemoryStream collected = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int count;
            while ((count = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                collected.Write(chunk, 0, count);
            }
            return collected.ToArray();
        }
    }

    /// <summary>
    /// Click handler: asks for a Word HTML file, cleans it, then asks where to
    /// save the result. Cancelling either dialog abandons the operation.
    /// </summary>
    private void button1_Click(object sender, System.EventArgs e)
    {
        openFileDialog1.Title = "Open Word Html";
        openFileDialog1.Filter = "htm files (*.htm)|*.htm|html files (*.html)|*.html";

        // Without this check a cancelled dialog leaves no file selected and
        // OpenFile() throws.
        if (openFileDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        string filename = openFileDialog1.FileName;

        byte[] inputBytes;
        using (Stream stm = openFileDialog1.OpenFile())
        {
            inputBytes = ReadToEnd(stm);
        }

        // NOTE(review): the document is decoded and re-encoded as UTF-8
        // unconditionally. Bytes that are not valid UTF-8 (for example a file
        // saved in a Windows code page) are replaced during decoding - confirm
        // the expected input encoding.
        string doc = System.Text.Encoding.UTF8.GetString(inputBytes);
        byte[] outputBytes = System.Text.Encoding.UTF8.GetBytes(CleanHtml(doc));

        saveFileDialog1.FileName = BuildFixedFileName(filename);
        saveFileDialog1.Title = "Save Fixed Word Html";

        // Do not write anything if the user backs out of the save dialog.
        if (saveFileDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        using (Stream fs = saveFileDialog1.OpenFile())
        {
            fs.Write(outputBytes, 0, outputBytes.Length);
        }
    }
}
}


// Here is a link to a handy online version:

http://www.eggheadcafe.com/articles/wordcleaner.aspx


Submission Date:  12/2/2005 6:44:39 AM
Submitted By:  Peter Bromberg
My Home Page:  http://www.eggheadcafe.com

Popularity  (253 Views)