Attribute VB_Name = "basUTF8"
Option Explicit
' *******************************************************************************************
' Copyright (c) 2005 DI Management Services Pty Limited. All rights reserved.
' This code was originally written by David Ireland.
' It is not to be altered or distributed, except as part of an application.
' You are free to use it in any application, provided the copyright notice is left unchanged.
' *******************************************************************************************
Public Function IsUTF8(Str As String, Optional AcceptNull As Boolean = False) As Boolean
' Returns True if string Str conforms to requirements for valid UTF-8, otherwise False.
' Note that a string of 7-bit ASCII characters will return True (because that is still valid UTF-8)
' but this version will return False if the string contains a zero
' (NUL, Chr(0)) character unless flag AcceptNull is True.
' Returns True for all valid UTF-8 strings, not just those that could be converted to Latin-1.
' References:
' 1. "UTF-8 and Unicode FAQ for Unix/Linux" by Marcus Kuhn <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
' 2. "Unicode UTF-8 encoding" by Oscar van Vlijmen <http://www1.tip.nl/~t876506/utf8tbl.html>
Dim i As Long
Dim nFillBytes As Long
Dim bv As Integer
IsUTF8 = False ' Guilty until proven innocent
nFillBytes = 0
' VB uses little-endian 16-bit Unicode strings so we examine values byte-by-byte
' expecting every odd byte {1,3,5,...} to be non-zero
' and every even byte {2,4,6,...} to be zero
For i = 1 To LenB(Str)
' Get character code for this byte
bv = AscB(MidB(Str, i, 1))
''Debug.Print Hex(bv) & " ";
' Odd or even?
If (i And 1) Then
' An odd byte so expecting non-zero value
If bv = 0 And AcceptNull = False Then
' We choose to reject strings that contain a NUL character
GoTo Done
End If
If bv = &HFE Or bv = &HFF Then
' Two values UTF-8 never uses
GoTo Done
End If
If nFillBytes > 0 Then
' We are expecting a `fill' byte that must be in the range 0x80 to 0xBF
If bv >= &H80 And bv <= &HBF Then
' OK, we have a valid fill byte
nFillBytes = nFillBytes - 1
Else
GoTo Done
End If
ElseIf bv >= &HC0 Then
' We have found a `lead' byte: how many `fill' bytes follow?
If bv >= &HC0 And bv <= &HDF Then
nFillBytes = 1
ElseIf bv >= &HE0 And bv <= &HEF Then
nFillBytes = 2
ElseIf bv >= &HF0 And bv <= &HF7 Then
nFillBytes = 3
ElseIf bv >= &HF8 And bv <= &HFB Then
nFillBytes = 4
ElseIf bv >= &HFC And bv <= &HFD Then
nFillBytes = 5
Else
GoTo Done
End If
ElseIf bv > &H7F Then
' Not allowed on its own
GoTo Done
Else
' We have a 7-bit ASCII char, which is OK
' so just continue
End If
Else
' An even byte that should be zero
If bv <> 0 Then
' We have a UCS-2 Unicode character > 255 => not UTF-8
GoTo Done
End If
End If
Next
' If we got here, we must be valid
IsUTF8 = True
Done:
''Debug.Print
Exit Function
End Function