Technical Article

Proper Case A String Or Name

,

SQL 2005 user defined function to convert a string to proper case (title case).

Flexible handling of delimiters and apostrophes to cater for words like "hasn't" and also names like "O'Reilly" and "d'Angelo".

Mac and Mc surnames are handled generically, with some rules and exceptions which may need to be extended.  Since McIntyre and Mcintyre are both valid capitalisations, the input string capitalisation is preserved where capitalisation cannot be conclusively determined as incorrect.  Credits to Kim Ryan's NameParse http://cpan.uwinnipeg.ca/htdocs/Lingua-EN-NameParse/Lingua/EN/NameParse.pm.htm for Mac and Mc exceptions.

This rule could also apply for other family name variants like "d'Angelo" and "D'Angelo" but it has not been applied in this version.

To use in SQL 2000, replace nvarchar(max) with unicode or non-unicode limited length string, and remove semi-colons within the script.

CREATE FUNCTION [dbo].[ProperCase] 
(
	@input	nvarchar(max)
) 
/*
Copyright © D.Allen-Williams
SQL 2005 user defined function to convert a string to proper case (title case).

Flexible handling of delimiters and apostrophes to cater for words like "hasn't" and also names like "O'Reilly" and "d'Angelo".

Mac and Mc surnames are handled generically, with some rules and exceptions which may need to be extended.  Since McIntyre and Mcintyre are both valid capitalisations, the input string capitalisation is preserved where capitalisation cannot be conclusively determined as incorrect.  Credits to Kim Ryan's NameParse http://cpan.uwinnipeg.ca/htdocs/Lingua-EN-NameParse/Lingua/EN/NameParse.pm.htm for Mac and Mc exceptions.

This rule could also apply for other name variants like "d'Angelo" and "D'Angelo" but it has not been applied in this version.

To use in SQL 2000, replace nvarchar(max) with unicode or non-unicode limited length string, and remove semi-colons within the script.

Examples:
	SELECT	dbo.ProperCase(null),
			dbo.ProperCase('')
	SELECT	dbo.ProperCase('gus van der sant'),
			dbo.ProperCase('gus van sant')		-- this won't work unless the exceptions section is modified
	SELECT	dbo.ProperCase('machinery macaw mackie macjewski m'),
			dbo.ProperCase('MacDonald Macdonald macdonald '),					-- middle spelling should be preserved
			dbo.ProperCase('McIntyre mcintyre Mcintyre MCINTYRE McinàTyre')
	SELECT	dbo.ProperCase('23b 8th avenue, o''reilly''s tavern, i''ve found something, haven''t i?')
*/
RETURNS nvarchar(max)
AS
BEGIN
	-- Return NULL if input string is NULL
	IF @input IS NULL RETURN NULL;
	
	-- Declarations
	DECLARE @output			nvarchar(max)
		,	@char			nvarchar(1)
		,	@nextChar		nvarchar(1)
		,	@delim			nvarchar(1)
		,	@posn			int
		,	@nextPosn		int
		,	@len			int
		,	@lowerExists	bit;
	
	-- Initialisation
	SELECT	@posn			= 1
		,	@len			= LEN(@input)
		,	@output			= LOWER(@input)
		,	@char			= LEFT(LOWER(@input), 1)
		,	@lowerExists	= CASE WHEN LEFT(LOWER(@input), 1) = 'i' THEN 1 ELSE 0 END;
	
	-- Get the first character
	WHILE @posn <= @len
	BEGIN
		-- Skip non-letters (delimiters)
		WHILE UNICODE(@char) = UNICODE(UPPER(@char)) AND @posn <= @len
			SELECT	@delim = @char
				,	@char  = SUBSTRING(@output, @posn+1, 1)
				,	@posn  = @posn + 1;
		
		-- Convert the first character after a delimiter to upper case
		IF (@delim <> N'''' OR @lowerExists = 0)	-- ignore Isn't, but match O'Reilly
		   AND (CHARINDEX(@delim,'0123456789') = 0 OR SUBSTRING(@output, @posn, 2) NOT IN ('st','nd','rd','th'))	-- ignore 1st, 2nd, 3rd, 4th
		   AND @posn <= @len
		BEGIN
			SET @output = STUFF(@output, @posn, 1, UPPER(@char));

			-- Check for McXXX
			IF SUBSTRING(@output, @posn, 2) = 'Mc'
			BEGIN
				-- Find next delimiter
				SET @nextPosn = @posn + 1;
				WHILE UNICODE(SUBSTRING(@output, @nextPosn, 1)) <> UNICODE(UPPER(SUBSTRING(@output, @nextPosn, 1))) AND @nextPosn <= @len
					SET @nextPosn = @nextPosn + 1;

				-- Ignore any names less than 5 letters
				IF @nextPosn >= @posn + 5
				   -- preserve original capitalisation if first letter was capitalised and remaining weren't
				   AND (SUBSTRING(@output, @posn, @nextPosn-@posn) COLLATE Latin1_General_CS_AS) NOT LIKE (SUBSTRING(@input, @posn, @nextPosn-@posn) COLLATE Latin1_General_CS_AS)
					SET @output = STUFF(@output, @posn + 2, 1, UPPER(SUBSTRING(@output, @posn + 2, 1)));

				SET @posn = @nextPosn - 1;	-- skip to last character of name
			END

			-- Check for MacXXX
			IF SUBSTRING(@output, @posn, 3) = 'Mac'
			BEGIN
				-- Find next delimiter and the trailing character of the name
				SET @nextPosn = @posn + 2;
				WHILE UNICODE(SUBSTRING(@output, @nextPosn, 1)) <> UNICODE(UPPER(SUBSTRING(@output, @nextPosn, 1))) AND @nextPosn <= @len
					SELECT	@nextChar = SUBSTRING(@output, @nextPosn, 1)
						,	@nextPosn = @nextPosn + 1;

				-- Ignore any names less than 6 letters
				IF @nextPosn >= @posn + 6
				   -- exclude typical Polish or Italian Mac names ending in a,c,i,o,z or j
				   AND @nextChar NOT IN ('a','c','i','o','z','j')
				   -- preserve original capitalisation if first letter was capitalised and remaining weren't
				   AND (SUBSTRING(@output, @posn, @nextPosn-@posn) COLLATE Latin1_General_CS_AS) NOT LIKE (SUBSTRING(@input, @posn, @nextPosn-@posn) COLLATE Latin1_General_CS_AS)
					SET @output = STUFF(@output, @posn + 3, 1, UPPER(SUBSTRING(@output, @posn + 3, 1)));

				SET @posn = @nextPosn - 1;	-- skip to last character of name
			END
				
		END

		-- Increment position.  Treat as if the apostrophe follows non capitals for I've, I'm, I'd.
		SELECT	@lowerExists = CASE WHEN @char = 'i' THEN 1 ELSE 0 END	
			,	@char  = SUBSTRING(@output, @posn+1, 1)
			,	@posn  = @posn + 1;

		-- Skip remaining letters.
		WHILE UNICODE(@char) <> UNICODE(UPPER(@char)) AND @posn <= @len
			SELECT	@lowerExists = 1
				,	@char  = SUBSTRING(@output, @posn+1, 1)
				,	@posn  = @posn + 1;
	END

	-- Handle special name cases
	SET @output = ' ' + @output + ' ';	-- pad so we can find matches at the start and end of the string
	SET @output = REPLACE(@output, ' D''',  ' d''');
	SET @output = REPLACE(@output, ' De ',  ' de ');
	SET @output = REPLACE(@output, ' Del ', ' del ');
	SET @output = REPLACE(@output, ' Der ', ' der ');
	SET @output = REPLACE(@output, ' Di ',  ' di ');
	SET @output = REPLACE(@output, ' Du ',  ' du ');
	SET @output = REPLACE(@output, ' El ',  ' el ');
	SET @output = REPLACE(@output, ' La ',  ' la ');
	SET @output = REPLACE(@output, ' Le ',  ' le ');
	SET @output = REPLACE(@output, ' Lo ',  ' lo ');
	SET @output = REPLACE(@output, ' Los ', ' los ');
	SET @output = REPLACE(@output, ' Von ', ' von ');
	SET @output = REPLACE(@output, ' Van de ', ' van de ');
	SET @output = REPLACE(@output, ' Van der ',' van der ');
	-- SET @output = REPLACE(@output, ' Van ', ' van '); -- don't replace because this might be a motor vehicle
	SET @output = SUBSTRING(@output, 2, @len);	-- remove padding

	-- Correct Mac exceptions
	SET @output = REPLACE(@output, 'MacHin',	'Machin');		-- machine and variants
	SET @output = REPLACE(@output, 'MacHlin',	'Machlin');
	SET @output = REPLACE(@output, 'MacKle',	'Mackle');
	SET @output = REPLACE(@output, 'MacKlin',	'Macklin');
	SET @output = REPLACE(@output, 'MacKie',	'Mackie');
	SET @output = REPLACE(@output, 'MacHado',	'Machado');		-- Portuguese
	SET @output = REPLACE(@output, 'MacEvicius','Macevicius');	-- Lithuanian
	SET @output = REPLACE(@output, 'MacIulis',	'Maciulis');	-- Lithuanian
	SET @output = REPLACE(@output, 'MacIas',	'Macias');		-- Lithuanian
	SET @output = REPLACE(@output, 'Macmurdo',	'MacMurdo');	-- only 'Mac' name ending in 'o'?
	-- SET @output = REPLACE(@output, 'MacHar',	'Machar');		-- not an exception, MacHarg is valid

	RETURN @output;
END

Rate

5 (2)

You rated this post out of 5. Change rating

Share

Share

Rate

5 (2)

You rated this post out of 5. Change rating