Please start any new threads on our new site at https://forums.sqlteam.com. We've got lots of great SQL Server experts to answer whatever question you can come up with.

 All Forums
 SQL Server 2005 Forums
 Transact-SQL (2005)
 A relational technique to strip the HTML tags out

Author  Topic 

prasad332
Starting Member

3 Posts

Posted - 2007-11-27 : 10:43:41
I had a problem with the ntext datatype. I need to strip the HTML tags out of a ntext datatype column. I have sample query for that, which works fine for STRING, as stuff is the string function, what to do for ntext field.

=======The Process follows like this =========

--**************************************
--
-- Name: A relational technique to strip
-- the HTML tags out of a string
-- Description:A relational technique to
-- strip the HTML tags out of a string. Th
-- is solution demonstrates how to use simp
-- le tables & search functions effectively
-- in SQL Server to solve procedural / ite
-- rative problems.
-- By: Umachandar
--
--This code is copyrighted and has-- limited warranties.Please see http://
-- www.Planet-Source-Code.com/vb/scripts/Sh
-- owCode.asp?txtCodeId=197&lngWId=5--for details.--**************************************
--

-- This table contains the tags to be re
-- placed. The % in <head%>
-- will take care of any extra informati
-- on in the tag that you needn't worry
-- about as a whole. In any case, this t
-- able contains all the tags that needs
-- to be search & replaced.
CREATE TABLE #html ( tag varchar(30) )
INSERT #html VALUES ( '<html>' )
INSERT #html VALUES ( '<head%>' )
INSERT #html VALUES ( '<title%>' )
INSERT #html VALUES ( '<link%>' )
INSERT #html VALUES ( '</title>' )
INSERT #html VALUES ( '</head>' )
INSERT #html VALUES ( '<body%>' )
INSERT #html VALUES ( '</html>' )
go
-- A simple table with the HTML strings
CREATE TABLE #t ( id tinyint IDENTITY , string varchar(255) )
INSERT #t VALUES (
'<HTML><HEAD><TITLE>Some Name</TITLE>
<LINK REL="stylesheet" HREF="/style.css" TYPE="text/css" ></HEAD>
<BODY BGCOLOR="FFFFFF" VLINK="#444444">
SOME HTML text after the body</HTML>'
)
INSERT #t VALUES (
'<HTML><HEAD><TITLE>Another Name</TITLE>
<LINK REL="stylesheet" HREF="/style.css"></HEAD>
<BODY BGCOLOR="FFFFFF" VLINK="#444444">Another HTML text after the body</HTML>'
)
go
-- This is the code to strip the tags ou
-- t.
-- It finds the starting location of eac
-- h tag in the HTML string ,
-- finds the length of the tag with the
-- extra properties if any. This is
-- done by locating the end of the tag n
-- amely '>'. The same is done
-- in a loop till all tags are replaced.
--


BEGIN TRAN
WHILE exists(select * FROM #t JOIN #html on patindex('%' + tag + '%' , string ) > 0 )
UPDATE #t
SET string = stuff( string , patindex('%' + tag + '%' , string ) ,
charindex( '>' , string , patindex('%' + tag + '%' , string ) )
- patindex('%' + tag + '%' , string ) + 1 , '' )
FROM #t JOIN #html
ON patindex('%' + tag + '%' , string ) > 0
SELECT * FROM #t
rollback

cr8nk
Yak Posting Veteran

77 Posts

Posted - 2007-11-27 : 12:53:10
can you change your ntext field to nvarchar(max)? Or can you copy your ntext records that need to be scrubbed to a work table with a nvarchar(max) data type column then perform the data manipulation in the work table then copy back to the main table?

use tempdb

go

CREATE TABLE #html ( tag varchar(30) )
INSERT #html VALUES ( '<html>' )
INSERT #html VALUES ( '<head%>' )
INSERT #html VALUES ( '<title%>' )
INSERT #html VALUES ( '<link%>' )
INSERT #html VALUES ( '</title>' )
INSERT #html VALUES ( '</head>' )
INSERT #html VALUES ( '<body%>' )
INSERT #html VALUES ( '</html>' )
go

CREATE TABLE #t ( id tinyint IDENTITY , textcol ntext)
INSERT #t VALUES (
'<HTML><HEAD><TITLE>Some Name</TITLE>
<LINK REL="stylesheet" HREF="/style.css" TYPE="text/css" ></HEAD>
<BODY BGCOLOR="FFFFFF" VLINK="#444444">
SOME HTML text after the body</HTML>'
)
INSERT #t VALUES (
'<HTML><HEAD><TITLE>Another Name</TITLE>
<LINK REL="stylesheet" HREF="/style.css"></HEAD>
<BODY BGCOLOR="FFFFFF" VLINK="#444444">Another HTML text after the body</HTML>'
)

CREATE TABLE #work (id int, textcol nvarchar(max) )

INSERT INTO #work (id, textcol)
SELECT id, textcol FROM #t
WHERE 1 = 1 --where records need to be scrubbed

WHILE exists(select * FROM #work JOIN #html on patindex('%' + tag + '%' , textcol ) > 0 )
UPDATE #work
SET textcol = stuff( textcol , patindex('%' + tag + '%' , textcol ) ,
charindex( '>' , textcol , patindex('%' + tag + '%' , textcol ) )
- patindex('%' + tag + '%' , textcol ) + 1 , '' )
FROM #work JOIN #html
ON patindex('%' + tag + '%' , textcol ) > 0

SELECT * FROM #work

UPDATE t
SET textcol = w.textcol
FROM #t t
INNER JOIN #work w
ON t.id = w.id

SELECT * FROM #t

Go to Top of Page

anonymous1
Posting Yak Master

185 Posts

Posted - 2007-11-27 : 14:39:47
if possible changing the datatype may work best in the long run, if not this may be what you wanted...

UPDATE #t
SET string =
substring(string
,0
,patindex('%' + tag + '%' , string)
)
+
substring(string
,patindex('%' + tag + '%' , string) + datalength(replace(tag, '%>', '')) + patindex( '%>%' , substring(string, patindex('%' + tag + '%' , string) + datalength(replace(tag, '%>', '')) ,datalength(string)))
,datalength(string)
)
Go to Top of Page
   

- Advertisement -