diff --git a/Sources/FoundationEssentials/String/CMakeLists.txt b/Sources/FoundationEssentials/String/CMakeLists.txt
index 720eb218d..6cc7994d7 100644
--- a/Sources/FoundationEssentials/String/CMakeLists.txt
+++ b/Sources/FoundationEssentials/String/CMakeLists.txt
@@ -15,10 +15,12 @@
 target_sources(FoundationEssentials PRIVATE
     BidirectionalCollection.swift
     BuiltInUnicodeScalarSet.swift
+    IANACharsetNames.swift
     RegexPatternCache.swift
     String+Bridging.swift
     String+Comparison.swift
     String+Encoding.swift
+    String+Encoding+Names.swift
     String+EndianAdaptorSequence.swift
     String+Essentials.swift
     String+IO.swift
diff --git a/Sources/FoundationEssentials/String/IANACharsetNames.swift b/Sources/FoundationEssentials/String/IANACharsetNames.swift
new file mode 100644
index 000000000..8f3e88f09
--- /dev/null
+++ b/Sources/FoundationEssentials/String/IANACharsetNames.swift
@@ -0,0 +1,213 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2025 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+
+// WARNING: DO NOT EDIT THIS FILE DIRECTLY.
+// This is auto-generated by `update-iana-charset-names`.
+
+
+extension IANACharset {
+    /// IANA Charset `US-ASCII`.
+    static let usASCII = IANACharset(
+        preferredMIMEName: "US-ASCII",
+        name: "US-ASCII",
+        aliases: [
+            "iso-ir-6",
+            "ANSI_X3.4-1968",
+            "ANSI_X3.4-1986",
+            "ISO_646.irv:1991",
+            "ISO646-US",
+            "US-ASCII",
+            "us",
+            "IBM367",
+            "cp367",
+            "csASCII",
+        ]
+    )
+
+    /// IANA Charset `ISO-8859-1`.
+    static let iso8859_1 = IANACharset(
+        preferredMIMEName: "ISO-8859-1",
+        name: "ISO_8859-1:1987",
+        aliases: [
+            "iso-ir-100",
+            "ISO_8859-1",
+            "ISO-8859-1",
+            "latin1",
+            "l1",
+            "IBM819",
+            "CP819",
+            "csISOLatin1",
+        ]
+    )
+
+    /// IANA Charset `ISO-8859-2`.
+    static let iso8859_2 = IANACharset(
+        preferredMIMEName: "ISO-8859-2",
+        name: "ISO_8859-2:1987",
+        aliases: [
+            "iso-ir-101",
+            "ISO_8859-2",
+            "ISO-8859-2",
+            "latin2",
+            "l2",
+            "csISOLatin2",
+        ]
+    )
+
+    /// IANA Charset `Shift_JIS`.
+    static let shiftJIS = IANACharset(
+        preferredMIMEName: "Shift_JIS",
+        name: "Shift_JIS",
+        aliases: [
+            "MS_Kanji",
+            "csShiftJIS",
+        ]
+    )
+
+    /// IANA Charset `EUC-JP`.
+    static let eucJP = IANACharset(
+        preferredMIMEName: "EUC-JP",
+        name: "Extended_UNIX_Code_Packed_Format_for_Japanese",
+        aliases: [
+            "csEUCPkdFmtJapanese",
+            "EUC-JP",
+        ]
+    )
+
+    /// IANA Charset `ISO-2022-JP`.
+    static let iso2022JP = IANACharset(
+        preferredMIMEName: "ISO-2022-JP",
+        name: "ISO-2022-JP",
+        aliases: [
+            "csISO2022JP",
+        ]
+    )
+
+    /// IANA Charset `UTF-8`.
+    static let utf8 = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-8",
+        aliases: [
+            "csUTF8",
+        ]
+    )
+
+    /// IANA Charset `UTF-16BE`.
+    static let utf16BE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-16BE",
+        aliases: [
+            "csUTF16BE",
+        ]
+    )
+
+    /// IANA Charset `UTF-16LE`.
+    static let utf16LE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-16LE",
+        aliases: [
+            "csUTF16LE",
+        ]
+    )
+
+    /// IANA Charset `UTF-16`.
+    static let utf16 = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-16",
+        aliases: [
+            "csUTF16",
+        ]
+    )
+
+    /// IANA Charset `UTF-32`.
+    static let utf32 = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-32",
+        aliases: [
+            "csUTF32",
+        ]
+    )
+
+    /// IANA Charset `UTF-32BE`.
+    static let utf32BE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-32BE",
+        aliases: [
+            "csUTF32BE",
+        ]
+    )
+
+    /// IANA Charset `UTF-32LE`.
+    static let utf32LE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-32LE",
+        aliases: [
+            "csUTF32LE",
+        ]
+    )
+
+    /// IANA Charset `macintosh`.
+    static let macintosh = IANACharset(
+        preferredMIMEName: nil,
+        name: "macintosh",
+        aliases: [
+            "mac",
+            "csMacintosh",
+        ]
+    )
+
+    /// IANA Charset `windows-1250`.
+    static let windows1250 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1250",
+        aliases: [
+            "cswindows1250",
+        ]
+    )
+
+    /// IANA Charset `windows-1251`.
+    static let windows1251 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1251",
+        aliases: [
+            "cswindows1251",
+        ]
+    )
+
+    /// IANA Charset `windows-1252`.
+    static let windows1252 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1252",
+        aliases: [
+            "cswindows1252",
+        ]
+    )
+
+    /// IANA Charset `windows-1253`.
+    static let windows1253 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1253",
+        aliases: [
+            "cswindows1253",
+        ]
+    )
+
+    /// IANA Charset `windows-1254`.
+    static let windows1254 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1254",
+        aliases: [
+            "cswindows1254",
+        ]
+    )
+}
diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift
new file mode 100644
index 000000000..1407e5ae1
--- /dev/null
+++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift
@@ -0,0 +1,155 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2025 Apple Inc.
and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+
+// MARK: - Private extensions for parsing encoding names
+
+private extension UTF8.CodeUnit {
+    /// Returns whether this code unit equals `other`, `other._uppercased`, or
+    /// `other._lowercased` (those two helpers are not defined in this file;
+    /// presumably they only fold ASCII letters -- confirm at their definition).
+    func _isASCIICaseInsensitivelyEqual(to other: UTF8.CodeUnit) -> Bool {
+        return switch self {
+        case other, other._uppercased, other._lowercased: true
+        default: false
+        }
+    }
+}
+
+private extension String {
+    /// Returns whether this string and `other` are equal when their UTF-8 views
+    /// are compared code unit by code unit with
+    /// `UTF8.CodeUnit._isASCIICaseInsensitivelyEqual(to:)`.
+    ///
+    /// Views of different lengths never compare equal.
+    func _isASCIICaseInsensitivelyEqual(to other: String) -> Bool {
+        return self.utf8.elementsEqual(other.utf8) {
+            $0._isASCIICaseInsensitivelyEqual(to: $1)
+        }
+    }
+}
+
+
+// MARK: - IANA Charset Names
+
+/// Info about IANA Charset.
+internal struct IANACharset {
+    /// Preferred MIME Name
+    let preferredMIMEName: String?
+
+    /// The name of this charset
+    let name: String
+
+    /// The aliases of this charset
+    let aliases: [String]
+
+    /// The preferred MIME name when there is one; otherwise `name`.
+    var representativeName: String {
+        return preferredMIMEName ?? name
+    }
+
+    /// Stores the given values as they are; nothing is validated or normalized.
+    init(preferredMIMEName: String?, name: String, aliases: [String]) {
+        self.preferredMIMEName = preferredMIMEName
+        self.name = name
+        self.aliases = aliases
+    }
+
+    /// Returns whether `string` equals the preferred MIME name, the name, or
+    /// any of the aliases of this charset, where each comparison is ASCII
+    /// case-insensitive.
+    func matches(_ string: String) -> Bool {
+        if let preferredMIMEName,
+           preferredMIMEName._isASCIICaseInsensitivelyEqual(to: string) {
+            return true
+        }
+        if name._isASCIICaseInsensitivelyEqual(to: string) {
+            return true
+        }
+        return aliases.contains { $0._isASCIICaseInsensitivelyEqual(to: string) }
+    }
+}
+
+
+// MARK: - `String.Encoding` Names
+
+extension String.Encoding {
+    /// The IANA charset record mapped to this encoding, or `nil` when there is no mapping.
+    private var _ianaCharset: IANACharset? {
+        switch self {
+        case .utf8: .utf8
+        case .ascii: .usASCII
+        case .japaneseEUC: .eucJP
+        case .isoLatin1: .iso8859_1
+        case .shiftJIS: .shiftJIS
+        case .isoLatin2: .iso8859_2
+        case .unicode: .utf16
+        case .windowsCP1251: .windows1251
+        case .windowsCP1252: .windows1252
+        case .windowsCP1253: .windows1253
+        case .windowsCP1254: .windows1254
+        case .windowsCP1250: .windows1250
+        case .iso2022JP: .iso2022JP
+        case .macOSRoman: .macintosh
+        case .utf16BigEndian: .utf16BE
+        case .utf16LittleEndian: .utf16LE
+        case .utf32: .utf32
+        case .utf32BigEndian: .utf32BE
+        case .utf32LittleEndian: .utf32LE
+        default: nil
+        }
+    }
+
+    /// The name of this encoding that is compatible with the one of the IANA registry "charset".
+    @available(FoundationPreview 6.3, *)
+    public var ianaName: String? {
+        return _ianaCharset?.representativeName
+    }
+
+    /// Creates an instance from the name of the IANA registry "charset".
+    ///
+    /// - Note: The given name is compared to each IANA "charset" name
+    ///   with ASCII case-insensitive collation
+    ///   to determine which encoding is suitable.
+    @available(FoundationPreview 6.3, *)
+    public init?(ianaName charsetName: String) {
+        let possibilities: [String.Encoding] = [
+            .utf8,
+            .ascii,
+            .japaneseEUC,
+            .isoLatin1,
+            .shiftJIS,
+            .isoLatin2,
+            .unicode, // .utf16
+            .windowsCP1251,
+            .windowsCP1252,
+            .windowsCP1253,
+            .windowsCP1254,
+            .windowsCP1250,
+            .iso2022JP,
+            .macOSRoman,
+            .utf16BigEndian,
+            .utf16LittleEndian,
+            .utf32,
+            .utf32BigEndian,
+            .utf32LittleEndian,
+        ]
+
+        // A candidate without a charset record is skipped instead of trapping.
+        for encoding in possibilities where encoding._ianaCharset?.matches(charsetName) == true {
+            self = encoding
+            return
+        }
+        return nil
+    }
+}
+
diff --git a/Tests/FoundationEssentialsTests/StringTests.swift b/Tests/FoundationEssentialsTests/StringTests.swift
index 26286be15..6a6781874 100644
--- a/Tests/FoundationEssentialsTests/StringTests.swift
+++ b/Tests/FoundationEssentialsTests/StringTests.swift
@@ -1397,6 +1397,74 @@ private struct StringTests {
             "abcd🎺efgh"
         ])
     }
+
+    @Test func encodingNames() {
+        // Encoding to Name
+        #expect(String.Encoding.ascii.ianaName == "US-ASCII")
+        #expect(String.Encoding.nextstep.ianaName == nil)
+        #expect(String.Encoding.japaneseEUC.ianaName == "EUC-JP")
+        #expect(String.Encoding.utf8.ianaName == "UTF-8")
+        #expect(String.Encoding.isoLatin1.ianaName == "ISO-8859-1")
+        #expect(String.Encoding.symbol.ianaName == nil)
+        #expect(String.Encoding.nonLossyASCII.ianaName == nil)
+        #expect(String.Encoding.shiftJIS.ianaName == "Shift_JIS")
+        #expect(String.Encoding.isoLatin2.ianaName == "ISO-8859-2")
+        #expect(String.Encoding.unicode.ianaName == "UTF-16")
+        #expect(String.Encoding.windowsCP1251.ianaName == "windows-1251")
+        #expect(String.Encoding.windowsCP1252.ianaName == "windows-1252")
+        #expect(String.Encoding.windowsCP1253.ianaName == "windows-1253")
+        #expect(String.Encoding.windowsCP1254.ianaName == "windows-1254")
+        #expect(String.Encoding.windowsCP1250.ianaName == "windows-1250")
+        #expect(String.Encoding.iso2022JP.ianaName == "ISO-2022-JP")
+        #expect(String.Encoding.macOSRoman.ianaName == "macintosh")
+        #expect(String.Encoding.utf16BigEndian.ianaName == "UTF-16BE")
+        #expect(String.Encoding.utf16LittleEndian.ianaName == "UTF-16LE")
+        #expect(String.Encoding.utf32.ianaName == "UTF-32")
+        #expect(String.Encoding.utf32BigEndian.ianaName == "UTF-32BE")
+        #expect(String.Encoding.utf32LittleEndian.ianaName == "UTF-32LE")
+        #expect(String.Encoding(rawValue: .max).ianaName == nil)
+
+        // Name to Encoding
+        #expect(String.Encoding(ianaName: "us-ascii") == .ascii)
+        #expect(String.Encoding(ianaName: "iso-ir-2") == nil)
+        #expect(String.Encoding(ianaName: "x-nextstep") == nil)
+        #expect(String.Encoding(ianaName: "euc-jp") == .japaneseEUC)
+        #expect(String.Encoding(ianaName: "CP51932") == nil)
+        #expect(String.Encoding(ianaName: "utf-8") == .utf8)
+        #expect(String.Encoding(ianaName: "iso_8859-1") == .isoLatin1)
+        #expect(String.Encoding(ianaName: "x-mac-symbol") == nil)
+        #expect(String.Encoding(ianaName: "Adobe-symbol-encoding") == nil)
+        #expect(String.Encoding(ianaName: "cp932") == nil)
+        #expect(String.Encoding(ianaName: "shift_jis") == .shiftJIS)
+        #expect(String.Encoding(ianaName: "windows-31j") == nil)
+        #expect(String.Encoding(ianaName: "iso_8859-2") == .isoLatin2)
+        #expect(String.Encoding(ianaName: "utf-16") == .utf16)
+        #expect(String.Encoding(ianaName: "iso-10646-ucs-2") == nil)
+        #expect(String.Encoding(ianaName: "unicode-1-1") == nil)
+        #expect(String.Encoding(ianaName: "windows-1251") == .windowsCP1251)
+        #expect(String.Encoding(ianaName: "windows-1252") == .windowsCP1252)
+        #expect(String.Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1") == nil)
+        #expect(String.Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1") == nil)
+        #expect(String.Encoding(ianaName: "windows-1253") == .windowsCP1253)
+        #expect(String.Encoding(ianaName: "windows-1254") == .windowsCP1254)
+        #expect(String.Encoding(ianaName: "iso-8859-9-windows-Latin-5") == nil)
+        #expect(String.Encoding(ianaName: "windows-1250") == .windowsCP1250)
+        #expect(String.Encoding(ianaName: "iso-8859-2-windows-Latin-2") == nil)
+        #expect(String.Encoding(ianaName: "iso-2022-jp") == .iso2022JP)
+        #expect(String.Encoding(ianaName: "macintosh") == .macOSRoman)
+        #expect(String.Encoding(ianaName: "utf-16be") == .utf16BigEndian)
+        #expect(String.Encoding(ianaName: "utf-16le") == .utf16LittleEndian)
+        #expect(String.Encoding(ianaName: "utf-32") == .utf32)
+        #expect(String.Encoding(ianaName: "iso-10646-ucs-4") == nil)
+        #expect(String.Encoding(ianaName: "utf-32be") == .utf32BigEndian)
+        #expect(String.Encoding(ianaName: "utf-32le") == .utf32LittleEndian)
+        #expect(String.Encoding(ianaName: "foo-bar-baz") == nil)
+
+        // Alias to Encoding
+        #expect(String.Encoding(ianaName: "Latin1") == .isoLatin1)
+        #expect(String.Encoding(ianaName: "csUTF8") == .utf8)
+        #expect(String.Encoding(ianaName: "ms_kanji") == .shiftJIS)
+    }
 }
 
 // MARK: - Helper functions
diff --git a/utils/update-iana-charset-names b/utils/update-iana-charset-names
new file mode 100755
index 000000000..e56e972b1
--- /dev/null
+++ b/utils/update-iana-charset-names
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+##===----------------------------------------------------------------------===##
+##
+## This source file is part of the Swift.org open source project
+##
+## Copyright (c) 2025 Apple Inc. and the Swift project authors
+## Licensed under Apache License v2.0 with Runtime Library Exception
+##
+## See https://swift.org/LICENSE.txt for license information
+## See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+##
+##===----------------------------------------------------------------------===##
+
+# This is a shell script that generates a Swift source code file which contains
+# the list of IANA "Character Sets".
+
+set -eu
+
+# Assign first and mark read-only afterwards: `declare -r name="$(...)"` reports
+# the exit status of `declare` itself, which would hide a failing command
+# substitution from `set -e`.
+commandName="$(basename "$0")"
+utilsDir="$(cd "$(dirname "$0")" && pwd)"
+foundationRepoDir="$(cd "${utilsDir}/.." && pwd)"
+declare -r commandName utilsDir foundationRepoDir
+declare -r targetSwiftFileRelativePath="Sources/FoundationEssentials/String/IANACharsetNames.swift"
+
+copyrightYear=$(
+  currentYear=$(date +%Y)
+  if [[ $currentYear -eq 2025 ]]; then
+    echo 2025
+  else
+    echo "2025-${currentYear}"
+  fi
+)
+declare -r copyrightYear
+declare -r swiftLicenseHeader="
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) ${copyrightYear} Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+"
+declare -r warningComment="
+// WARNING: DO NOT EDIT THIS FILE DIRECTLY.
+// This is auto-generated by \`${commandName}\`.
+
+"
+
+echo "Generating Swift source code..." 1>&2
+declare generatedCode
+generatedCode=$(
+  echo "${swiftLicenseHeader##$'\n'}"
+  echo "$warningComment"
+  swift -D PRINT_CODE "${utilsDir}/${commandName}-impl.swift"
+)
+
+echo "Writing the code to '${targetSwiftFileRelativePath}'..." 1>&2
+echo "$generatedCode" >"${foundationRepoDir}/${targetSwiftFileRelativePath}"
+
+echo "Done." 1>&2
diff --git a/utils/update-iana-charset-names-impl.swift b/utils/update-iana-charset-names-impl.swift
new file mode 100755
index 000000000..c7e83e0d8
--- /dev/null
+++ b/utils/update-iana-charset-names-impl.swift
@@ -0,0 +1,195 @@
+#!/usr/bin/env swift -D PRINT_CODE
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2025 Apple Inc.
and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+/*
+
+This is a Swift script that converts an XML file containing the list of IANA
+"Character Sets" to Swift source code.
+This script generates minimal code and is intended to be executed by another shell
+script.
+
+ */
+
+import Foundation
+#if canImport(FoundationXML)
+import FoundationXML
+#endif
+
+// MARK: - Constants
+
+let requiredCharsetNames = [
+    "UTF-8",
+    "US-ASCII",
+    "EUC-JP",
+    "ISO-8859-1",
+    "Shift_JIS",
+    "ISO-8859-2",
+    "UTF-16",
+    "windows-1251",
+    "windows-1252",
+    "windows-1253",
+    "windows-1254",
+    "windows-1250",
+    "ISO-2022-JP",
+    "macintosh",
+    "UTF-16BE",
+    "UTF-16LE",
+    "UTF-32",
+    "UTF-32BE",
+    "UTF-32LE",
+]
+let charsetsXMLURL = URL(
+    string: "https://www.iana.org/assignments/character-sets/character-sets.xml"
+)!
+let charsetsXMLNamespace = "http://www.iana.org/assignments"
+let swiftCodeIndent = "    "
+
+
+// MARK: - Implementation
+
+enum CodeGenerationError: Swift.Error {
+    case missingName
+    case missingAliasValue
+    case noRootElement
+}
+
+/// Representation of `<record>` element in 'character-sets.xml'
+///
+/// The structure of `<record>` element is as below:
+/// ```xml
+/// <record>
+///   <name>US-ASCII</name>
+///   <xref type="rfc" data="rfc2046"/>
+///   <value>3</value>
+///   <description>ANSI X3.4-1986</description>
+///   <alias>iso-ir-6</alias>
+///   <alias>ANSI_X3.4-1968</alias>
+///   <alias>ANSI_X3.4-1986</alias>
+///   <alias>ISO_646.irv:1991</alias>
+///   <alias>ISO646-US</alias>
+///   <alias>US-ASCII</alias>
+///   <alias>us</alias>
+///   <alias>IBM367</alias>
+///   <alias>cp367</alias>
+///   <alias>csASCII</alias>
+///   <preferred_alias>US-ASCII</preferred_alias>
+/// </record>
+/// ```
+struct IANACharsetNameRecord {
+    /// Preferred MIME Name
+    let preferredMIMEName: String?
+
+    /// The name of this charset
+    let name: String
+
+    /// The aliases of this charset
+    let aliases: [String]
+
+    var representativeName: String {
+        return preferredMIMEName ?? name
+    }
+
+    /// The lines of the `static let` declaration generated for this record, without outer indentation.
+    var swiftCodeLines: [String] {
+        var lines: [String] = []
+        lines.append("/// IANA Charset `\(representativeName)`.")
+        lines.append("static let \(representativeName._camelcased()) = IANACharset(")
+        lines.append("\(swiftCodeIndent)preferredMIMEName: \(preferredMIMEName.map { #""\#($0)""# } ?? "nil"),")
+        lines.append("\(swiftCodeIndent)name: \"\(name)\",")
+        lines.append("\(swiftCodeIndent)aliases: [")
+        for alias in aliases {
+            lines.append("\(swiftCodeIndent)\(swiftCodeIndent)\"\(alias)\",")
+        }
+        lines.append("\(swiftCodeIndent)]")
+        lines.append(")")
+        return lines
+    }
+
+    /// Reads the `name`, `preferred_alias`, and `alias` children of `node`; throws when `name` is absent.
+    init(_ node: XMLNode) throws {
+        guard let name = try node.nodes(forXPath: "./name").first?.stringValue else {
+            throw CodeGenerationError.missingName
+        }
+        self.name = name
+        self.preferredMIMEName = try node.nodes(forXPath: "./preferred_alias").first?.stringValue
+        self.aliases = try node.nodes(forXPath: "./alias").map {
+            guard let alias = $0.stringValue else { throw CodeGenerationError.missingAliasValue }
+            return alias
+        }
+    }
+}
+
+/// Loads the registry XML from `charsetsXMLURL` and returns the source of an `extension IANACharset`
+/// with one declaration per record whose representative name is listed in `requiredCharsetNames`.
+func generateSwiftCode() throws -> String {
+    let charsetsXMLDocument = try XMLDocument(contentsOf: charsetsXMLURL)
+    guard let charsetsXMLRoot = charsetsXMLDocument.rootElement() else { throw CodeGenerationError.noRootElement }
+    let charsetsXMLRecordElements = try charsetsXMLRoot.nodes(forXPath: "./registry/record")
+
+    var result = "extension IANACharset {"
+
+    for record in try charsetsXMLRecordElements.map({
+        try IANACharsetNameRecord($0)
+    }) where requiredCharsetNames.contains(record.representativeName) {
+        result += "\n"
+        result += record.swiftCodeLines.map({ swiftCodeIndent + $0 }).joined(separator: "\n")
+        result += "\n"
+    }
+
+    result += "}\n"
+    return result
+}
+
+#if PRINT_CODE
+print(try generateSwiftCode())
+#endif
+
+// MARK: - Extensions
+
+extension UTF8.CodeUnit {
+    var _isASCIINumeric: Bool { (0x30...0x39).contains(self) }
+    var _isASCIIUppercase: Bool { (0x41...0x5A).contains(self) }
+    var _isASCIILowercase: Bool { (0x61...0x7A).contains(self) }
+}
+
+extension String {
+    /// Joins the ASCII letter/digit runs of this string into one identifier: the first run is lowercased,
+    /// and `_` is inserted where one run ends and the next begins with a digit (`ISO-8859-1` -> `iso8859_1`).
+    func _camelcased() -> String {
+        var result = ""
+        var previousWord: Substring.UTF8View? = nil
+        for wordUTF8 in self.utf8.split(whereSeparator: {
+            !$0._isASCIINumeric &&
+            !$0._isASCIIUppercase &&
+            !$0._isASCIILowercase
+        }) {
+            defer { previousWord = wordUTF8 }
+            let word = String(Substring(wordUTF8))
+            guard let previousWord else {
+                result += word.lowercased()
+                continue
+            }
+            // `split` omits empty subsequences by default, so `last!` and `first!` cannot trap here.
+            if previousWord.last!._isASCIINumeric && wordUTF8.first!._isASCIINumeric {
+                result += "_"
+            }
+            if let firstNonNumericIndex = wordUTF8.firstIndex(where: { !$0._isASCIINumeric }),
+               wordUTF8[firstNonNumericIndex...].allSatisfy({ $0._isASCIIUppercase }) {
+                result += word
+            } else {
+                result += word.capitalized(with: nil)
+            }
+        }
+        return result
+    }
+}