Please start any new threads on our new site at https://forums.sqlteam.com. We've got lots of great SQL Server experts to answer whatever question you can come up with.

 All Forums
 General SQL Server Forums
 Script Library
 UDFs GETWORDCOUNT, GETWORDNUM

Author  Topic 

Igor2004
More clever than you

78 Posts

Posted - 2005-06-17 : 22:47:13
[code]
-- Author: Igor Nikiforov, Montreal, EMail: udfs@sympatico.ca
-- GETWORDCOUNT() User-Defined Function Counts the words in a string.
-- GETWORDCOUNT(@cString[, @cDelimiters])
-- Parameters
-- @cString nvarchar(4000) - Specifies the string whose words will be counted.
-- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString.
-- The default delimiters are space, tab, carriage return, and line feed. Note that GETWORDCOUNT( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter.
-- Return Value smallint
-- Remarks GETWORDCOUNT() by default assumes that words are delimited by spaces or tabs. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character.
-- If you use 'AAA aaa, BBB bbb, CCC ccc.' as the target string for dbo.GETWORDCOUNT(), you can get all the following results.
-- declare @cString nvarchar(4000)
-- set @cString = 'AAA aaa, BBB bbb, CCC ccc.'
-- select dbo.GETWORDCOUNT(@cString, default) -- 6 - character groups, delimited by ' '
-- select dbo.GETWORDCOUNT(@cString, ',') -- 3 - character groups, delimited by ','
-- select dbo.GETWORDCOUNT(@cString, '.') -- 1 - character group, delimited by '.'
-- See Also GETWORDNUM() User-Defined Function
-- UDF the name and functionality of which correspond to the same built-in function of Visual FoxPro
CREATE function GETWORDCOUNT (@cSrting nvarchar(4000), @cDelimiters nvarchar(256) )
returns smallint
as
begin
-- if no break string is specified, the function uses spaces, tabs and line feed to delimit words.
set @cDelimiters = isnull(@cDelimiters, space(1)+char(9)+char(10))
declare @p smallint, @end_of_string smallint, @wordcount smallint
select @p = 1, @wordcount = 0
select @end_of_string = 1 + datalength(@cSrting)/(case SQL_VARIANT_PROPERTY(@cSrting,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode

while dbo.CHARINDEX_BIN(substring(@cSrting, @p, 1), @cDelimiters, 1) > 0 and @end_of_string > @p -- skip opening break characters, if any
set @p = @p + 1

if @p < @end_of_string
begin
set @wordcount = 1 -- count the one we are in now count transitions from 'not in word' to 'in word'
-- if the current character is a break char, but the next one is not, we have entered a new word
while @p < @end_of_string
begin
if @p +1 < @end_of_string and dbo.CHARINDEX_BIN(substring(@cSrting, @p, 1), @cDelimiters, 1) > 0 and dbo.CHARINDEX_BIN(substring(@cSrting, @p+1, 1), @cDelimiters, 1) = 0
select @wordcount = @wordcount + 1, @p = @p + 1 -- Skip over the first character in the word. We know it cannot be a break character.
set @p = @p + 1
end
end

return @wordcount
end
GO

-- Author: Igor Nikiforov, Montreal, EMail: udfs@sympatico.ca
-- GETWORDNUM() User-Defined Function
-- Returns a specified word from a string.
-- GETWORDNUM(@cString, @nIndex[, @cDelimiters])
-- Parameters @cString nvarchar(4000) - Specifies the string to be evaluated
-- @nIndex smallint - Specifies the index position of the word to be returned. For example, if @nIndex is 3, GETWORDNUM( ) returns the third word (if @cString contains three or more words).
-- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString.
-- The default delimiters are space, tab, carriage return, and line feed. Note that GetWordNum( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter.
-- Return Value nvarchar(4000)
-- Remarks Returns the word at the position specified by @nIndex in the target string, @cString. If @cString contains fewer than @nIndex words, GETWORDNUM( ) returns an empty string.
-- See Also
-- GETWORDCOUNT() User-Defined Function
-- UDF the name and functionality of which correspond to the same built-in function of Visual FoxPro
CREATE function GETWORDNUM (@cSrting nvarchar(4000), @nIndex smallint, @cDelimiters nvarchar(256) )
returns nvarchar(4000)
as
begin
-- if no break string is specified, the function uses spaces, tabs and line feed to delimit words.
set @cDelimiters = isnull(@cDelimiters, space(1)+char(9)+char(10))
declare @i smallint, @j smallint, @p smallint, @q smallint, @qmin smallint, @end_of_string smallint, @LenDelimiters smallint, @outstr nvarchar(4000)
select @i = 1, @p = 1, @q = 0, @outstr = ''
select @end_of_string = 1 + datalength(@cSrting)/(case SQL_VARIANT_PROPERTY(@cSrting,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode
select @LenDelimiters = datalength(@cDelimiters)/(case SQL_VARIANT_PROPERTY(@cDelimiters,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode

while @i <= @nIndex
begin
while dbo.CHARINDEX_BIN(substring(@cSrting, @p, 1), @cDelimiters, 1) > 0 and @end_of_string > @p -- skip opening break characters, if any
set @p = @p + 1

if @p >= @end_of_string
break

select @j = 1, @qmin = @end_of_string -- find next break character it marks the end of this word
while @j <= @LenDelimiters
begin
set @q = dbo.CHARINDEX_BIN(substring(@cDelimiters, @j, 1), @cSrting, @p)
set @j = @j + 1
if @q > 0 and @qmin > @q
set @qmin = @q
end

if @i = @nIndex -- this is the actual word we are looking for
begin
set @outstr = substring(@cSrting, @p, @qmin-@p)
break
end
set @p = @qmin + 1

if (@p >= @end_of_string)
break
set @i = @i + 1
end

return @outstr
end
GO


-- Is similar to the built-in function Transact-SQL charindex, but regardless of collation settings,
-- executes case-sensitive search
-- Author: Igor Nikiforov, Montreal, EMail: udfs@sympatico.ca
CREATE function CHARINDEX_BIN(@expression1 nvarchar(4000), @expression2 nvarchar(4000), @start_location smallint = 1)
returns nvarchar(4000)
as
begin
return charindex( cast(@expression1 as nvarchar(4000)) COLLATE Latin1_General_BIN, cast(@expression2 as nvarchar(4000)) COLLATE Latin1_General_BIN, @start_location )
end
GO
[/code]

myleslee
Starting Member

1 Post

Posted - 2006-05-31 : 22:23:54
select dbo.getWordCount('we are the world , we are the children.', default)
=========
the result is 9 instead of 8.
Because a space was accidentally input before the comma. But it would be great to filter out punctuations.

thanks.
Go to Top of Page

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2006-06-01 : 02:28:32
[code]A real quicky could look like this code snippet below


CREATE FUNCTION dbo.fnFastWordCount
(
@Text NVARCHAR(4000)
)
RETURNS SMALLINT
AS

BEGIN
-- Get rid of prefix and suffix spaces
SELECT @Text = SUBSTRING(@Text, PATINDEX('%[^\ ]%', @Text), 4000),
@Text = LEFT(@Text, LEN(@Text))

-- Get rid of all unwanted, non word characters. Add more if you want to, such as CRLF...
WHILE CHARINDEX(',', @Text) > 0
SELECT @Text = REPLACE(@Text, ',', ' ')
WHILE CHARINDEX('.', @Text) > 0
SELECT @Text = REPLACE(@Text, '.', ' ')

-- Get rid of all double spaces
WHILE CHARINDEX(' ', @Text) > 0
SELECT @Text = REPLACE(@Text, ' ', ' ')

RETURN LEN(@Text) - LEN(REPLACE(@Text, ' ', '')) + 1
END


This is about 15 times faster than Igor's and does the same job and better.
You can easily add lines of code to remove other non wanted, non word characters.
Then my function is only about 10 times faster.

Here is a test script to measure calculation times


declare @cString nvarchar(4000),
@start datetime,
@i smallint,
@dummy smallint

select @cstring = '2068 2683 9537 3822 9752 6135 5787 3431 8676 4790 9175 1727 7117
1364 4590 3954 7801 5188 7858 1898 2880 9547 3250 9819 8947 9711 4250 2796 4133 3751
1010 8701 7803 4055 4444 5473 6376 120 9747 6026 1193 1231 8050 8914 3849 9617 2920 1748
9860 1802 2458 4478 7557 8822 2118 3950 3099 7947 645 7463 1600 9353 4802 4464 2087 5572
7487 6043 6141 7614 2328 8417 4362 2577 9760 2697 8306 6100 9080 455 3464 4976 8726 1740
9764 3498 4610 5623 1503 4089 3885 9460 9702 6314 2302 7251 2603 1371 7142 7995 5924
1722 6945 1294 3391 2527 700 8494 6059 6642 1330 8595 7815 1518 9539 3731 1493 434 985
5231 7649 77 1618 2809 4566 3435 2943 2391 1272 8002 5050 4793 9833 2779 7001 1291 5039
3372 118 7584 6020 1837 4709 5845 6834 2959 6987 6378 8491 1038 1231 1266 7081 4718 8083
5554 6123 3397 2403 367 1244 3877 4611 4088 3440 2040 9007 4701 5846 8952 1711 5618 9577
3530 9105 1444 4911 6263 9763 5064 1703 8253 7978 5861 479 781 7696 2272 737 1705 6114
3449 5101 7534 2216 2232 3634 189 4516 9898 2343 1758 1079 3355 5283 3830 3346 2788 28
8579 7825 76 9115 2381 8360 192 8157 2025 9065 9718 2047 4317 5613 6663 6745 9032 9612
8400 2092 1971 4345 8831 5662 2016 839 5364 1428 940 3910 1729 2142 3296 7482 5901 5729
1265 5725 7867 3910 6671 140 4385 4345 7497 7469 6243 3963 6847 3578 219 6265 8941 2612
3279 4984 5113 7584 4559 2499 7202 165 8266 2626 4373 5454 3228 4899 7204 9455 9209 1835
4601 4145 6457 5980 7994 8379 7547 3584 714 192 9166 214 1122 7769 9913 3947 876 850
9213 648 8266 9662 8119 5271 9323 1208 9532 1681 1400 2405 7980 13 4377 7854 4677 7404
3613 6893 4054 6323 2453 2065 3261 6080 6622 3811 6955 3433 431 1270 2174 134 832 166
1964 6675 7013 458 8151 3387 9441 8209 3834 2602 9036 8070 3807 2492 6535 498 9441 8390
1358 335 5709 6474 2654 7277 1030 4 4063 5413 5859 7511 9728 423 3688 7534 5502 1739
2171 1029 6241 9735 6344 8173 7204 2743 3892 1309 8805 1524 7100 2280 8237 3298 1355
1171 749 5045 9571 6275 6854 431 8212 430 414 2045 5 4366 1646 6257 4593 1341 1741 9696
9301 5545 8882 523 4406 2564 8565 5335 7965 5879 7167 8312 7167 6235 7408 5723 9411 419
9365 1999 6169 6687 5185 5997 5394 561 7677 6900 874 9911 7375 7229 209 8028 5376 6047
2402 9885 5502 8415 4319 6201 8162 5084 1012 259 2979 8939 7874 5782 3619 4659 2990 1491
4810 8378 8242 4566 8064 4871 1409 133 9371 8833 8337 7228 7434 3859 8112 5348 1232 8626
5450 8807 5598 2260 8620 8522 4689 2368 9389 3995 2598 7161 8055 3147 230 8026 6610
8245 7394 1908 4452 790 9186 5141 2829 1172 6672 8349 6880 2402 2655 2575 267 5542 7917
4212 5516 4533 5955 7644 9943 4971 1173 5092 2194 4821 8674 7882 4348 7342 2688 6650
3415 5591 1622 4746 3109 2212 7944 2214 6546 8878 8441 4762 3887 1135 8050 9264 594
4200 7339 2773 2699 7501 8405 6828 9842 5579 2686 4680 9840 1898 1867 4479 1910 8290
2840 7612 1484 4386 1108 7259 4922 4813 2762 3008 3071 8499 3422 7462 1564 5377 480 7993
10 4471 2752 7588 6784 940 1772 8062 4279 1130 3545 5900 4030 107 5497 8430 6219
5387 1403 33 6462 3989 2000 2596 1409 9734 1532 9947 8591 6058 2883 1659 586 8078 5942
6510 2541 6429 613 8370 2038 730 7418 2878 3636 589 1526 6466 1341 7560 9233 6155 1493
2088 3287 2815 311 6835 5049 7330 8226 5477 95 4653 7835 9914 6754 6795 6106 742 1145
3347 9864 8913 4597 9781 5312 4520 7359 5627 9082 2219 6434 8117 9523 4730 3607 7479
3256 8196 3929 507 5006 5579 7958 1154 8553 5859 3346 6916 8440 6620 701 2109 9708 9614
9853 1679 3421 5338 6324 7272 7107 4841 4387 2649 7543 631 1616 7369 3726 3575 8575 3076
3842 8114 6125 4625 7139 5317 2841 1705 9737 3320 5593 3937 6119 9905 7831 2665 8088
6222 3628 2654 1165 6176 562 6658 1441 8473 5209 3192 6102 2678 9354 4569 1840 6485 7848
4678 6013 2440 6030 8265 1806 7656 649 9672 740 5526 8697 4738 3226 791 1147 3540 534
5895 9617 3244 2356 7214 9926 1569 1522 1123 1666 8888 5762 4597 2663 3149 5674 7264
4985 9669 5375 2868 3389 1534 4557 5983 7415 1993 2723 9341 2079 3324 6713 3422 277 9818
6783 6385 6768 4563 6983 298 3193 2027 6774 5524 7562 168 8231 1950 9397 3438 7290 4612
9327 1346 8580 4657'

select @start = getdate(),
@i = 1000
/*
@i is 1000 -> whole test ~ 70 sec
@i is 100 -> whole test ~ 7 sec
@i is 10 -> whole test ~ 1 sec
*/
while @i > 0
select @dummy = dbo.GETWORDCOUNT(@cString, default),
@i = @i -1

select datediff(ms, @start, getdate()) 'ms'

select @start = getdate(),
@i = 1000

while @i > 0
select @dummy = dbo.fnFastWordCount(@cString),
@i = @i - 1

select datediff(ms, @start, getdate()) 'ms'
[/code]
Go to Top of Page

spirit1
Cybernetic Yak Master

11752 Posts

Posted - 2006-06-01 : 06:20:17
oh my god.... Igor's function has an error?!!?!??!??!???

That's unbeliveable!!!!

Someone get me some aspirin.... quick!!!



Go with the flow & have fun! Else fight the flow
Blog thingie: [URL="http://weblogs.sqlteam.com/mladenp"]
Go to Top of Page

SamC
White Water Yakist

3467 Posts

Posted - 2006-06-01 : 08:42:13
A sticky for functions like these would be handy. I've seen several come and go over the years.
Go to Top of Page

Michael Valentine Jones
Yak DBA Kernel (pronounced Colonel)

7020 Posts

Posted - 2006-06-01 : 10:51:40
quote:
Originally posted by myleslee

select dbo.getWordCount('we are the world , we are the children.', default)
=========
the result is 9 instead of 8.
Because a space was accidentally input before the comma. But it would be great to filter out punctuations.

thanks.



You just used the function incorrectly.

You need to specify the comma as one of the delimeters, or modify it to include commas as a default delimiter.
declare @Delim nvarchar(256)
set @Delim = space(1)+char(9)+char(10)+','

select
[Word Count] =
dbo.getWordCount('we are the world , we are the children.',@Delim)

Results:

Word Count
----------
8

(1 row(s) affected)


Igor really is a very clever fellow.



CODO ERGO SUM
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-08 : 11:07:49
To Peso
1) Your function doesn’t does the same job and better.
See, please, attentively the description of my function.
Your function works with delimiters ' .,'.
2) Your function isn’t about 15 times faster than my function.
Try it, please.

declare	@cString nvarchar(4000),
@start datetime, @i smallint, @cnt smallint,
@dummy smallint, @test1 float, @test2 float

select @cString = '2068,,,, 2683 9537 3822 9752 61.35 57 87 3 431 8676 4790 9175 1727 7117 1364 4590 3954 7801 5188 7858 1898 2880 9547 3250 9819 8947 9711 4250 2796 4133 3751 1010 8701 7803 4055 4444 5473 6376 120 9747 6026 1193 1231 8050 8914 3849 9617 292.............................2527 700 8494 6059 6642 1330 8595 7815 1518 9539 3731 1493 434 985 5231 7649 77 1618 2809 4566 3435 2943 2391 1272 8002 5050 4793 9833 2779 7001 1291 5039 3372 118 7584 6020 1837 4709 5845 6834 2959 6987 6378 8491 1038 1231 1266 7081 4718 8083 5554 6123 3397 2403 367 1244 3877 4611 4088 3440 2040 9007 4701 5846 8952 1711 5618 9577 3530 9105 1444 4911 6263 9763 5064 1703 8253 7978 5861 479 781 7696 2272 737 1705 6114 3449 5101 7534 2216 2232 3634 189 4516 9898 2343 1758 1079 3355 5283 3830 3346 2788 28 8579 7825 76 9115 2381 8360 192 8157 2025 9065 9718 2047 4317 5613 6663 6745 9032 9612 8400 2092 1971 4345 8831 5662 2016 839 5364 1428 940 3910 1729 2142 3296 7482 5901 5729 1265 5725 7867 3910 6671 140 4385 4345 7497 7469 6243 3963 6847 3578 219 6265 8941 2612 3279 4984 5113 7584 4559 2499 7202 165 8266 2626 4373 5454 3228 4899 7204 9455 9209 1835 4601 4145 6457 5980 7994 8379 7547 3584 714 192 9166 214 1122 7769 9913 3947 876 850 9213 648 8266 9662 8119 5271 9323 1208 9532 1681 1400 2405 7980 13 4377 7854 4677 7404 3613 6893 4054 6323 2453 2065 3261 6080 6622 3811 6955 3433 431 1270 2174 134 832 166 1964 6675 7013 458 8151 3387 9441 8209 3834 2602 9036 8070 3807 2492 6535 498 9441 8390 1358 335 5709 6474 2654 7277 1030 4 4063 5413 5859 7511 9728 423 3688 7534 5502 1739 2171 1029 6241 9735 6344 8173 7204 2743 3892 1309 8805 1524 7100 2280 8237 3298 1355 1171 749 5045 9571 6275 6854 431 8212 430 414 2045 5 4366 1646 6257 4593 1341 1741 9696 9301 5545 8882 523 4406 2564 8565 5335 7965 5879 7167 8312 7167 6235 7408 5723 9411 419 9365 1999 6169 6687 5185 5997 5394 561 7677 6900 874 9911 7375 7229 209 8028 5376 6047 2402 9885 5502 8415 4319 6201 8162 5084 1012 259 2979 8939 7874 5782 3619 4659 2990 1491 4810 8378 8242 4566 8064 4871 1409 133 9371 8833 8337 7228 7434 3859 8112 5348 1232 8626 5450 8807 5598 2260 8620 8522 4689 2368 9389 3995 2598 7161 8055 3147 230 8026 6610 8245 7394 1908 4452 790 9186 5141 2829 1172 6672 8349 6880 2402 2655 2575 267 5542 7917 4212 5516 4533 5955 7644 9943 4971 1173 5092 2194 4821 8674 7882 4348 7342 2688 6650 3415 5591 1622 4746 3109 2212 7944 2214 6546 8878 8441 4762 3887 1135 8050 9264 594 4200 7339 2773 2699 7501 8,,,,,,,,,,,,, 4479 1910 8290 2840 7612 1484 4386 1108 7259 4922 4813 2762 3008 3071 8499 3422 7462 1564 5377 480 799 310 4471 2752 7588 6784 940 1772 8062 4279 1130 3545 5900 4030 107 5497 8430 6219 5387 1403 33 6462 3989 2000 2596 1409 9734 1532 9947 8591 6058 2883 1659 586 8078 5942 6510 2541 6429 613 8370 2038 730 7418 2878 3636 589 1526 6466 1341 7560 9233 6155 1493 2088 3287 2815 311 6835 5049 7330 8226 5477 95 4653 7835 9914 6754 6795 6106 742 1145 3347 9864 8913 4597 9781 5312 4520 7359 5627 9082 2219 6434 8117 9523 4730 3607 7479 3256 8196 3929 507 5006 5579 7958 1154 8553 5859 3346 6916 8440 6620 701 2109 9708 9614 9853 1679 3421 5338 6324 7272 7107 4841 4387 2649 7543 631 1616 7369 3726 3575 8575 3076 3842 8114 6125 4625 7139 5317 2841 1705 9737 3320 5593 3937 6119 9905 7831 2665 8088 6222 3628 2654 1165 6176 562 6658 1441 8473 5209 3192 6102 2678 9354 4569 1840 6485 7848 4678 6013 2440 6030 8265 1806 7656 649 9672 740 5526 8697 4738 3226 791 1147 3540 534 5895 9617 3244 2356 7214 9926 1569 1522 1123 1666 8888 5762 4597 2663 3149 5674 7264 4985 9669 5375 2868 3389 1534 4557 5983 7415 1993 2723 9341 2079 3324 6713 3422 277 9818 6783 6385 6768 4563 6983 298 3193 2027 6774 5524 7562 168 8231 1950 9. 39..7 3438 ...... 7290 4612,,,, 9327 1346 .... .... 8580 465..7'

set @cnt = 1000
/*
@cnt is 1000 -> whole test ~ 70 sec
@cnt is 100 -> whole test ~ 7 sec
@cnt is 10 -> whole test ~ 1 sec
*/

select @start = getdate(), @i = @cnt
while @i > 0
select @dummy = dbo.GETWORDCOUNT(@cString, ',. '), @i = @i -1

set @test1 = datediff(ms, @start, getdate())
select @test1 'GETWORDCOUNT is slow ms'

select @start = getdate(), @i = @cnt

while @i > 0
select @dummy = dbo.fnFastWordCount(@cString), @i = @i - 1
set @test2 = datediff(ms, @start, getdate())
select @test2 'fnFastWordCount is very, very fast ms'
select @test1/@test2 'faster ? 15 time ? You are sure ? :-) '


Regards,
Igor
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-08 : 11:51:53
to Peso
in conclusion
Your "better" fonction is Erroneous
Try it, please
select dbo.fnFastWordCount('.'), dbo.fnFastWordCount(' ') -- display 1, 1
FYI
quote:
The accuracy criteria of user-defined functions
GETWORDNUM() , GETWORDCOUNT(), GETALLWORDS(), GETALLWORDS2() working with strings:

1) Both character parameters are empty - the function returns nothing - either 0 strings, or 0.

2) The first parameter is not empty, the second is empty - the function returns the table from one string or 1. The first parameter is in this case the required word.

3) The function works correctly with strings starting and/or finishing with one or several delimiters.

4) The function works correctly on strings consisting only of delimiters - a result in this case is 0 strings, or 0 words.

5) The function works correctly irrespective of the character sets in the string or delimiters, including /, \,?, ^, %, -, ' etc., that is, any character that have special function.

No exceptions, if the parameters of an incorrect type are transmitted, there is a standard error message.

Above mentioned and other functions completely correspond to these criteria.

See, please
[url]http://www.universalthread.com/wconnect/wc.dll?LevelExtreme~2,54,33,27115[/url]
Igor
Go to Top of Page

X002548
Not Just a Number

15586 Posts

Posted - 2006-06-08 : 12:24:19
I need a function to find out code that is more clever than you



Brett

8-)

Hint: Want your questions answered fast? Follow the direction in this link
http://weblogs.sqlteam.com/brettk/archive/2005/05/25/5276.aspx

Add yourself!
http://www.frappr.com/sqlteam
Go to Top of Page

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2006-06-08 : 16:27:35
quote:

2) Your function isn’t about 15 times faster than my function.
Try it, please.

Regards,
Igor



I tried. I got 50.0 ms for your function and 0.0 ms for my function, and division by zero. Does that mean that my function is infinity times faster than yours?

I was just pointing out that there were others ways to go... No hard feelings?
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-08 : 17:19:38
-- This is my result of above mentioned test
5530 ms GETWORDCOUNT
8076 ms fnFastWordCount
0.684744923229321 = 5530 / 8076
-- Here is my information:
SELECT SERVERPROPERTY('Edition')  -- displays  Developer Edition
SELECT SERVERPROPERTY('ProductVersion') -- displays 9.00.1399.06


Try the test with my example of the string, please
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-08 : 17:56:00
May be results are different because Collations Settings are different
My Collation Setting is Case Sensitive

quote:
I was just pointing out that there were others ways to go

This is small correction
quote:
I was just pointing out that there were others UNCOMFORTABLES ans ERRONEOUS ways to go


Try it, please
select dbo.fnFastWordCount('.....s....s.')

The difference is large between
'uncomfortable erroneous' and 'better faster'

Regards,
Igor
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-08 : 18:07:57
This is example very very very fast function !
This is other way to go !
CREATE FUNCTION dbo.fnVeryVeryVeryFastWordCount
( @Text NVARCHAR(4000))
RETURNS SMALLINT
AS
BEGIN
RETURN 1
END
GO

But this function is erroneous as your function.
Go to Top of Page

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2006-06-09 : 01:53:19
quote:

Try it, please
select dbo.fnFastWordCount('.....s....s.')

select dbo.fnFastWordCount('.'), dbo.fnFastWordCount(' ')

The difference is large between
'uncomfortable erroneous' and 'better faster'

Regards,
Igor

I didn't know those ['.....s....s.', ',' and ' '] were real sentences.
Igor, I believe you have teached me something new today!

BTW, I get 757 words (in 0.0 ms), as you (in 50.0 ms), in your last example starting with
select	@cString = '2068,,,, 2683 9537
and
SELECT SERVERPROPERTY('Edition') -- Enterprise Edition
SELECT SERVERPROPERTY('ProductVersion') -- 9.00.2153.00

SELECT SERVERPROPERTY('Edition') -- Developer Edition
SELECT SERVERPROPERTY('ProductVersion') -- 8.00.2039

SELECT SERVERPROPERTY('Edition') -- Developer Edition
SELECT SERVERPROPERTY('ProductVersion') -- 10.00.208.00

Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-09 : 17:34:37
quote:
Igor, I believe you have teached me something new today!

Surely
This is function without errors
-- Igor Nikiforov "Quality - forever"
-- GETWORDCOUNT1() User-Defined Function Counts the words in a string.
-- GETWORDCOUNT1(@cString[, @cDelimiters])
-- Parameters
-- @cString nvarchar(4000) - Specifies the string whose words will be counted.
-- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString.
-- The default delimiters are space, ',' and '.'. Note that GETWORDCOUNT1( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter.
-- Return Value smallint
-- Remarks GETWORDCOUNT1() by default assumes that words are delimited by spaces or ',' and '.'. If you specify another character as delimiter, this function ignores spaces ',' and '.' and uses only the specified character.
-- If you use 'AAA aaa, BBB bbb, CCC ccc.' as the target string for dbo.GETWORDCOUNT1(), you can get all the following results.
-- declare @cString nvarchar(4000)
-- set @cString = 'AAA aaa, BBB bbb, CCC ccc.'
-- select dbo.GETWORDCOUNT1(@cString, default) -- 6 - character groups, delimited by ' '
-- select dbo.GETWORDCOUNT1(@cString, ',') -- 3 - character groups, delimited by ','
-- select dbo.GETWORDCOUNT1(@cString, '.') -- 1 - character group, delimited by '.'
CREATE function [dbo].[GETWORDCOUNT1] (@cString nvarchar(4000), @cDelimiters nvarchar(256) = ' ,.' )
returns smallint
as
begin
declare @k smallint
select @k = 2, @cDelimiters = isnull(@cDelimiters, ' ,.') -- if no break string is specified, the function uses spaces, ',' and '.' to delimit words.

if datalength(@cString) = 0
return 0
if datalength(@cDelimiters) = 0
return 1

while @k <= datalength(@cDelimiters)/2
select @cString = replace(@cString COLLATE Latin1_General_BIN,
substring(@cDelimiters, @k, 1) COLLATE Latin1_General_BIN,
left(@cDelimiters, 1) COLLATE Latin1_General_BIN),
@k = @k + 1

while charindex(replicate(left(@cDelimiters, 1),2), @cString) > 0
select @cString = replace(@cString COLLATE Latin1_General_BIN,
replicate(left(@cDelimiters, 1),2) COLLATE Latin1_General_BIN, left(@cDelimiters, 1) COLLATE Latin1_General_BIN)

if ascii(left(@cDelimiters, 1)) = ascii(left(@cString, 1))
select @cString = right(@cString, datalength(@cString)/2 - 1)

if ascii(left(@cDelimiters, 1)) = ascii(right(@cString, 1))
select @cString = left(@cString, datalength(@cString)/2 - 1)

return case when datalength(@cString) = 0 then 0
else (datalength(@cString) - datalength(replace(@cString COLLATE Latin1_General_BIN, left(@cDelimiters, 1) COLLATE Latin1_General_BIN, '')))/2 + 1 end

end

Try it, please
select [dbo].[GETWORDCOUNT1]('', '')
select [dbo].[GETWORDCOUNT1]('1', '')
select [dbo].[GETWORDCOUNT1]('', '123456789')
select [dbo].[GETWORDCOUNT1]('123456789', '123456789')
select [dbo].[GETWORDCOUNT1](' 1 2 3 4 5 6 7 8 9 ', '123456789')
select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', default)

This is my traditional function
 -- GETWORDCOUNT() User-Defined Function Counts the words in a string.
-- GETWORDCOUNT(@cString[, @cDelimiters])
-- Parameters
-- @cString nvarchar(4000) - Specifies the string whose words will be counted.
-- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString.
-- The default delimiters are space, tab, carriage return, and line feed. Note that GETWORDCOUNT( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter.
-- Return Value smallint
-- Remarks GETWORDCOUNT() by default assumes that words are delimited by spaces or tabs. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character.
-- If you use 'AAA aaa, BBB bbb, CCC ccc.' as the target string for dbo.GETWORDCOUNT(), you can get all the following results.
-- declare @cString nvarchar(4000)
-- set @cString = 'AAA aaa, BBB bbb, CCC ccc.'
-- select dbo.GETWORDCOUNT(@cString, default) -- 6 - character groups, delimited by ' '
-- select dbo.GETWORDCOUNT(@cString, ',') -- 3 - character groups, delimited by ','
-- select dbo.GETWORDCOUNT(@cString, '.') -- 1 - character group, delimited by '.'
-- See Also GETWORDNUM(), GETALLWORDS() User-Defined Functions
-- UDF the name and functionality of which correspond to the Visual FoxPro function
CREATE function [dbo].[GETWORDCOUNT] (@cString nvarchar(4000), @cDelimiters nvarchar(256) )
returns smallint
as
begin
declare @k smallint, @nEndString smallint, @wordcount smallint
select @k = 1, @wordcount = 0, @cDelimiters = isnull(@cDelimiters, nchar(32)+nchar(9)+nchar(10)+nchar(13)), -- if no break string is specified, the function uses spaces, tabs, carriage return and line feed to delimit words.
@nEndString = 1 + datalength(@cString)/(case SQL_VARIANT_PROPERTY(@cString,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode

while charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) > 0 and @nEndString > @k -- skip opening break characters, if any
set @k = @k + 1

if @k < @nEndString
begin
set @wordcount = 1 -- count the one we are in now count transitions from 'not in word' to 'in word'
-- if the current character is a break char, but the next one is not, we have entered a new word
while @k < @nEndString
begin
if @k +1 < @nEndString and charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) > 0
and charindex(substring(@cString, @k+1, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) = 0
select @wordcount = @wordcount + 1, @k = @k + 1 -- Skip over the first character in the word. We know it cannot be a break character.
set @k = @k + 1
end
end

return @wordcount
end


Go to Top of Page

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2006-06-12 : 05:04:57
Yes, I tried you function without errors, again, as you asked me to.

It still confuses me that hyphen is a word in your example (I get 5 as output result).
select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', default)
If I change
select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', default)
to
select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', '-,. ')
I get 4 words as I would expect. That's ok! I understand the difference.


But then I have a problem with
select [dbo].[GETWORDCOUNT1]('Peter Larsson is very, very annoying with so-called comments.', '-,. ')
Your function returns 10, where I would expect 9.

With this select, I would like to have 14 as returned output
select [dbo].[GETWORDCOUNT1]('How many alpha-numeric characters are there in the english alphabet - 26, 52 or 91?', '-,. ')
not 15 as the current output tells me.

But maybe that is a feature in your function without errors?

Perhaps it would be possible to check for imbedded hyphens? If surrounded by non-delimiter characters, don't treat hyphen as delimiter, even if selected as one?


Peter Larsson
Helsingborg, Sweden
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-12 : 10:57:11
Hi, Peter

select [dbo].[GETWORDCOUNT1]('Peter Larsson is very, very annoying with so-called comments.', '-,. ') -- displau 10
quote:
Your function returns 10, where I would expect 9.

1 Peter
2 Larsson
3 is
4 very
5 very
6 annoying
7 with
8 so
9 called
10 comments
See, attentively, the delimiters, please

BTW
1) Do you speak French ?
2) Would you like help me. I need the translation 18 phrases to Swedish language ?
Igor.

http://udfunctions.blogspot.com/
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-12 : 11:02:10
quote:
Perhaps it would be possible to check for imbedded hyphens? If surrounded by non-delimiter characters, don't treat hyphen as delimiter, even if selected as one?

Certainly, but
this is a problem is more complex.
Igor.

http://udfunctions.blogspot.com/
Go to Top of Page

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2006-06-12 : 12:04:50
quote:
BTW
1) Do you speak French ?
2) Would you like help me. I need the translation 18 phrases to Swedish language ?
Igor.


1) Oui, je parle francais en peu.
2) Glad to help.


Peter Larsson
Helsingborg, Sweden
Go to Top of Page

Igor2004
More clever than you

78 Posts

Posted - 2006-06-12 : 22:02:10
quote:
Perhaps it would be possible to check for imbedded hyphens? If surrounded by non-delimiter characters, don't treat hyphen as delimiter, even if selected as one?

This is function, without errors :-)
CREATE function [dbo].[GETWORDCOUNT3]  (@cString nvarchar(4000), @cDelimiters1 nvarchar(256) = ' ,.', @cDelimiters2 nvarchar(256) = '-')
returns smallint
as
begin
declare @wordcount smallint
select @cDelimiters1 = isnull(@cDelimiters1, ' ,.'), -- if no break string1 is specified, the function uses spaces, '.' and ',' to delimit words.
@cDelimiters2 = isnull(@cDelimiters2, '-') -- if no break string2 is specified, the function uses '-' to delimit words.

select @wordcount = count(*) from dbo.GETALLWORDS_AUXILIARY(@cString, @cDelimiters1+@cDelimiters2 ) where WORDNUM = 1 or not(datalength(DELIMITERS)/2 = 1 and charindex(DELIMITERS, @cDelimiters2) > 0)

return @wordcount
end


 -- GETALLWORDS_AUXILIARY() User-Defined Function Inserts the words from a string into the table.
-- GETALLWORDS_AUXILIARY(@cString[, @cDelimiters])
-- Parameters
-- @cString nvarchar(4000) - Specifies the string whose words will be inserted into the table @GETALLWORDS.
-- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString.
-- The default delimiters are space, tab, carriage return, and line feed. Note that GETALLWORDS_AUXILIARY( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter.
-- Return Value table
-- Remarks GETALLWORDS_AUXILIARY() by default assumes that words are delimited by spaces or tabs. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character.
-- Example
-- declare @cString nvarchar(4000)
-- set @cString = 'The default delimiters are space, tab, carriage return, and line feed. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character.'
-- select * from dbo.GETALLWORDS_AUXILIARY(@cString, default)
-- select * from dbo.GETALLWORDS_AUXILIARY(@cString, ' ,.')
-- See Also GETWORDNUM() , GETWORDCOUNT() User-Defined Functions
ALTER function [dbo].[GETALLWORDS_AUXILIARY] (@cString nvarchar(4000), @cDelimiters nvarchar(256))
returns @GETALLWORDS table (WORDNUM smallint, WORD nvarchar(4000), STARTOFWORD smallint, LENGTHOFWORD smallint, DELIMITERS nvarchar(4000))
begin
declare @k smallint, @wordcount smallint, @nEndString smallint, @BegOfWord smallint, @flag bit, @EndPreviousWord smallint, @Delimiters nvarchar(4000)

select @k = 1, @wordcount = 1, @BegOfWord = 1, @flag = 0, @cString = isnull(@cString, ''), @Delimiters = '',
@cDelimiters = isnull(@cDelimiters, nchar(32)+nchar(9)+nchar(10)+nchar(13)), -- if no break string is specified, the function uses spaces, tabs, carriage return and line feed to delimit words.
@nEndString = 1 + datalength(@cString) /(case SQL_VARIANT_PROPERTY(@cString,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode

while 1 > 0
begin
if @k - @BegOfWord > 0
begin
if @wordcount > 1
begin
select @EndPreviousWord = STARTOFWORD + LENGTHOFWORD from @GETALLWORDS where WORDNUM = @wordcount - 1
select @Delimiters = substring(@cString, @EndPreviousWord, @BegOfWord - @EndPreviousWord)
end
else if @BegOfWord > 1
select @Delimiters = substring(@cString, 1, @BegOfWord -1)

insert into @GETALLWORDS (WORDNUM, WORD, STARTOFWORD, LENGTHOFWORD, DELIMITERS) values( @wordcount, substring(@cString, @BegOfWord, @k-@BegOfWord), @BegOfWord, @k-@BegOfWord, @Delimiters) -- previous word
select @wordcount = @wordcount + 1, @BegOfWord = @k
end
if @flag = 1
break

while charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) > 0 and @nEndString > @k -- skip break characters, if any
select @k = @k + 1, @BegOfWord = @BegOfWord + 1
while charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) = 0 and @nEndString > @k -- skip the character in the word
select @k = @k + 1
if @k >= @nEndString
select @flag = 1
end
return
end


select [dbo].[GETWORDCOUNT3]('Peter Larsson - is very, very annoying with so-called comments.', default, default)  -- display 9


http://udfunctions.blogspot.com/
Go to Top of Page

X002548
Not Just a Number

15586 Posts

Posted - 2006-06-26 : 14:51:07
Why would you want a function without errors?

Brett

8-)

Hint: Want your questions answered fast? Follow the direction in this link
http://weblogs.sqlteam.com/brettk/archive/2005/05/25/5276.aspx

Add yourself!
http://www.frappr.com/sqlteam
Go to Top of Page
    Next Page

- Advertisement -