Attribute VB_Name = "basUTF8"
Option Explicit

' *******************************************************************************************
' Copyright (c) 2005 DI Management Services Pty Limited. All rights reserved.
' This code was originally written by David Ireland.
' It is not to be altered or distributed, except as part of an application.
' You are free to use it in any application, provided the copyright notice is left unchanged.
' *******************************************************************************************

Public Function IsUTF8(Str As String, Optional AcceptNull As Boolean = False) As Boolean
' Returns True if string Str conforms to requirements for valid UTF-8, otherwise False.
' Note that a string of 7-bit ASCII characters will return True (because that is still valid UTF-8)
' but this version will return False if the string contains a zero
' (NUL, Chr(0)) character unless flag AcceptNull is True.
' Returns True for all valid UTF-8 strings, not just those that could be converted to Latin-1.
' References:
' 1. "UTF-8 and Unicode FAQ for Unix/Linux" by Marcus Kuhn <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
' 2. "Unicode UTF-8 encoding" by Oscar van Vlijmen <http://www1.tip.nl/~t876506/utf8tbl.html>

    Dim i As Long
    Dim nFillBytes As Long
    Dim bv As Integer
    
    IsUTF8 = False  ' Guilty until proven innocent
    nFillBytes = 0
    ' VB uses little-endian 16-bit Unicode strings so we examine values byte-by-byte
    ' expecting every odd byte {1,3,5,...} to be non-zero
    ' and every even byte {2,4,6,...} to be zero
    For i = 1 To LenB(Str)
        ' Get character code for this byte
        bv = AscB(MidB(Str, i, 1))
        ''Debug.Print Hex(bv) & " ";
        ' Odd or even?
        If (i And 1) Then
            ' An odd byte so expecting non-zero value
            If bv = 0 And AcceptNull = False Then
                ' We choose to reject strings that contain a NUL character
                GoTo Done
            End If
            If bv = &HFE Or bv = &HFF Then
                ' Two values UTF-8 never uses
                GoTo Done
            End If
            If nFillBytes > 0 Then
                ' We are expecting a `fill' byte that must be in the range 0x80 to 0xBF
                If bv >= &H80 And bv <= &HBF Then
                    ' OK, we have a valid fill byte
                    nFillBytes = nFillBytes - 1
                Else
                    GoTo Done
                End If
            ElseIf bv >= &HC0 Then
                ' We have found a `lead' byte: how many `fill' bytes follow?
                If bv >= &HC0 And bv <= &HDF Then
                    nFillBytes = 1
                ElseIf bv >= &HE0 And bv <= &HEF Then
                    nFillBytes = 2
                ElseIf bv >= &HF0 And bv <= &HF7 Then
                    nFillBytes = 3
                ElseIf bv >= &HF8 And bv <= &HFB Then
                    nFillBytes = 4
                ElseIf bv >= &HFC And bv <= &HFD Then
                    nFillBytes = 5
                Else
                    GoTo Done
                End If
            ElseIf bv > &H7F Then
                ' Not allowed on its own
                GoTo Done
            Else
                ' We have a 7-bit ASCII char, which is OK
                ' so just continue
            End If
            
        Else
            ' An even byte that should be zero
            If bv <> 0 Then
                ' We have a UCS-2 Unicode character > 255 => not UTF-8
                GoTo Done
            End If
        End If
    Next
    
    ' If we got here, we must be valid
    IsUTF8 = True
    
Done:
    ''Debug.Print
    Exit Function
    
End Function