From 3bc76c01f956557e4766ac466f76edd895483fa4 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Fri, 9 May 2025 15:49:34 +0900 Subject: [PATCH 01/14] Import implementation for String Encoding Names from other repo. - source: https://github.com/YOCKOW/SF-StringEncodingNameImpl --- .../String/String+Encoding+Names.swift | 551 ++++++++++++++++++ 1 file changed, 551 insertions(+) create mode 100644 Sources/FoundationEssentials/String/String+Encoding+Names.swift diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift new file mode 100644 index 000000000..07ca26c21 --- /dev/null +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -0,0 +1,551 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + + +// MARK: - Private extensions for parsing encoding names + +private extension Unicode.Scalar { + var _isASCIINumeric: Bool { + return ("0"..."9").contains(self) + } + + var _asciiNumericValue: Int { + assert(_isASCIINumeric) + return Int(self.value - 0x30) + } + + /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". + /// + /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace + var _isASCIIWhitespace: Bool { + switch self.value { + case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true + default: false + } + } +} + +private extension String { + var _trimmed: Substring.UnicodeScalarView { + let scalars = self.unicodeScalars + let isNonWhitespace: (Unicode.Scalar) -> Bool = { !$0._isASCIIWhitespace } + guard let firstIndexOfNonWhitespace = scalars.firstIndex(where: isNonWhitespace), + let lastIndexOfNonWhitespace = scalars.lastIndex(where: isNonWhitespace) else { + return Substring.UnicodeScalarView() + } + return scalars[firstIndexOfNonWhitespace...lastIndexOfNonWhitespace] + } +} + +/// A type that holds a `Unicode.Scalar` where its value is compared case-insensitively with others' +/// _if the value is within ASCII range_. +private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, + ExpressibleByUnicodeScalarLiteral { + typealias UnicodeScalarLiteralType = Unicode.Scalar.UnicodeScalarLiteralType + + let scalar: Unicode.Scalar + + @inlinable + init(_ scalar: Unicode.Scalar) { + assert(scalar.isASCII) + self.scalar = scalar + } + + init(unicodeScalarLiteral value: Unicode.Scalar.UnicodeScalarLiteralType) { + self.init(Unicode.Scalar(unicodeScalarLiteral: value)) + } + + @inlinable + static func ==( + lhs: ASCIICaseInsensitiveUnicodeScalar, + rhs: ASCIICaseInsensitiveUnicodeScalar + ) -> Bool { + if lhs.scalar == rhs.scalar { + return true + } else if ("A"..."Z").contains(lhs.scalar) { + return lhs.scalar.value + 0x20 == rhs.scalar.value + } else if ("a"..."z").contains(lhs.scalar) { + return lhs.scalar.value - 0x20 == rhs.scalar.value + } + return false + } +} + +/// A type to tokenize string for `String.Encoding` names. +private protocol StringEncodingNameTokenizer: ~Copyable { + associatedtype Token: Equatable + init(name: String) + mutating func nextToken() throws -> Token? +} + +extension StringEncodingNameTokenizer where Self: ~Copyable { + mutating func hasEqualTokens(with other: consuming Self) throws -> Bool { + while let myToken = try self.nextToken() { + guard let otherToken = try other.nextToken(), + myToken == otherToken else { + return false + } + } + return try other.nextToken() == nil + } +} + +/// ICU-independent parser that follows [Charset Alias Matching](https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching). +private struct UTS22Tokenizer: StringEncodingNameTokenizer, ~Copyable { + enum Token: Equatable { + case numeric(Int) + case alphabet(ASCIICaseInsensitiveUnicodeScalar) + } + + enum Error: Swift.Error { + case tooLargeNumericValue + } + + let scalars: String.UnicodeScalarView + + private var _currentIndex: String.UnicodeScalarView.Index + + init(name: String) { + self.scalars = name.unicodeScalars + self._currentIndex = scalars.startIndex + } + + mutating func nextToken() throws -> Token? { + guard _currentIndex < scalars.endIndex else { + return nil + } + + let scalar = scalars[_currentIndex] + switch scalar { + case "0"..."9": + // Parse a numeric value ignoring leading zeros. + // + // NOTE: To prevent the value from overflow, a threhold is set here. + // The max number of digits to be expected is 8 as of now: i.g. `csISO42JISC62261978`. + // It wouldn't matter to throw an error in practice when the value is too large. + + let threshold: Int = 999_999_999 + var value = scalar._asciiNumericValue + scalars.formIndex(after: &_currentIndex) + while _currentIndex < scalars.endIndex { + let currentScalar = scalars[_currentIndex] + guard currentScalar._isASCIINumeric else { + break + } + value = value * 10 + currentScalar._asciiNumericValue + if value > threshold { + throw Error.tooLargeNumericValue + } + scalars.formIndex(after: &_currentIndex) + } + return .numeric(value) + case "A"..."Z", "a"..."z": + scalars.formIndex(after: &_currentIndex) + return .alphabet(ASCIICaseInsensitiveUnicodeScalar(scalar)) + default: + scalars.formIndex(after: &_currentIndex) + if _currentIndex < scalars.endIndex { + return try nextToken() + } + return nil + } + } +} + + +/// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s. +private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { + typealias Token = ASCIICaseInsensitiveUnicodeScalar + + enum Error: Swift.Error { + case nonASCII + } + + let scalars: Substring.UnicodeScalarView + + var _currentIndex: Substring.UnicodeScalarView.Index + + init(name: String) { + self.scalars = name._trimmed + self._currentIndex = scalars.startIndex + } + + mutating func nextToken() throws -> Token? { + guard _currentIndex < scalars.endIndex else { + return nil + } + let scalar = scalars[_currentIndex] + guard scalar.isASCII else { throw Error.nonASCII } + defer { + scalars.formIndex(after: &_currentIndex) + } + return ASCIICaseInsensitiveUnicodeScalar(scalar) + } +} + + +private extension String { + func isEqual( + to other: String, + tokenizedBy tokenizer: T.Type + ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable { + do { + var myTokenizer = T(name: self) + let otherTokenizer = T(name: other) + return try myTokenizer.hasEqualTokens(with: otherTokenizer) + } catch { + // Any errors imply that `self` or `other` contains invalid characters. + return false + } + } +} + + +// MARK: - IANA Charset Names + +/// Info about IANA Charset. +private struct IANACharset { + /// Preferred MIME Name + let preferredMIMEName: String? + + /// The name of this charset + let name: String + + /// The aliases of this charset + let aliases: Array + + var representativeName: String { + return preferredMIMEName ?? name + } + + init(preferredMIMEName: String?, name: String, aliases: Array) { + self.preferredMIMEName = preferredMIMEName + self.name = name + self.aliases = aliases + } + + func matches( + _ string: String, + tokenizedBy tokenizer: T.Type + ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable { + if let preferredMIMEName = self.preferredMIMEName, + preferredMIMEName.isEqual(to: string, tokenizedBy: tokenizer) { + return true + } + if name.isEqual(to: string, tokenizedBy: tokenizer) { + return true + } + for alias in aliases { + if alias.isEqual(to: string, tokenizedBy: tokenizer) { + return true + } + } + return false + } +} + +// Extracted only necessary charsets from https://www.iana.org/assignments/character-sets/character-sets.xhtml +extension IANACharset { + /// IANA Characater Set `US-ASCII` + static let usASCII = IANACharset( + preferredMIMEName: "US-ASCII", + name: "US-ASCII", + aliases: [ + "iso-ir-6", + "ANSI_X3.4-1968", + "ANSI_X3.4-1986", + "ISO_646.irv:1991", + "ISO646-US", + "US-ASCII", + "us", + "IBM367", + "cp367", + "csASCII", + ] + ) + + /// IANA Characater Set `ISO-8859-1` + static let iso8859_1 = IANACharset( + preferredMIMEName: "ISO-8859-1", + name: "ISO_8859-1:1987", + aliases: [ + "iso-ir-100", + "ISO_8859-1", + "ISO-8859-1", + "latin1", + "l1", + "IBM819", + "CP819", + "csISOLatin1", + ] + ) + + /// IANA Characater Set `ISO-8859-2` + static let iso8859_2 = IANACharset( + preferredMIMEName: "ISO-8859-2", + name: "ISO_8859-2:1987", + aliases: [ + "iso-ir-101", + "ISO_8859-2", + "ISO-8859-2", + "latin2", + "l2", + "csISOLatin2", + ] + ) + + /// IANA Characater Set `Shift_JIS` + static let shiftJIS = IANACharset( + preferredMIMEName: "Shift_JIS", + name: "Shift_JIS", + aliases: [ + "MS_Kanji", + "csShiftJIS", + ] + ) + + /// IANA Characater Set `EUC-JP` + static let eucJP = IANACharset( + preferredMIMEName: "EUC-JP", + name: "Extended_UNIX_Code_Packed_Format_for_Japanese", + aliases: [ + "csEUCPkdFmtJapanese", + "EUC-JP", + ] + ) + + /// IANA Characater Set `ISO-2022-JP` + static let iso2022JP = IANACharset( + preferredMIMEName: "ISO-2022-JP", + name: "ISO-2022-JP", + aliases: [ + "csISO2022JP", + ] + ) + + /// IANA Characater Set `UTF-8` + static let utf8 = IANACharset( + preferredMIMEName: nil, + name: "UTF-8", + aliases: [ + "csUTF8", + ] + ) + + /// IANA Characater Set `UTF-16BE` + static let utf16BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16BE", + aliases: [ + "csUTF16BE", + ] + ) + + /// IANA Characater Set `UTF-16LE` + static let utf16LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16LE", + aliases: [ + "csUTF16LE", + ] + ) + + /// IANA Characater Set `UTF-16` + static let utf16 = IANACharset( + preferredMIMEName: nil, + name: "UTF-16", + aliases: [ + "csUTF16", + ] + ) + + /// IANA Characater Set `UTF-32` + static let utf32 = IANACharset( + preferredMIMEName: nil, + name: "UTF-32", + aliases: [ + "csUTF32", + ] + ) + + /// IANA Characater Set `UTF-32BE` + static let utf32BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32BE", + aliases: [ + "csUTF32BE", + ] + ) + + /// IANA Characater Set `UTF-32LE` + static let utf32LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32LE", + aliases: [ + "csUTF32LE", + ] + ) + + /// IANA Characater Set `macintosh` + static let macintosh = IANACharset( + preferredMIMEName: nil, + name: "macintosh", + aliases: [ + "mac", + "csMacintosh", + ] + ) + + /// IANA Characater Set `windows-1250` + static let windows1250 = IANACharset( + preferredMIMEName: nil, + name: "windows-1250", + aliases: [ + "cswindows1250", + ] + ) + + /// IANA Characater Set `windows-1251` + static let windows1251 = IANACharset( + preferredMIMEName: nil, + name: "windows-1251", + aliases: [ + "cswindows1251", + ] + ) + + /// IANA Characater Set `windows-1252` + static let windows1252 = IANACharset( + preferredMIMEName: nil, + name: "windows-1252", + aliases: [ + "cswindows1252", + ] + ) + + /// IANA Characater Set `windows-1253` + static let windows1253 = IANACharset( + preferredMIMEName: nil, + name: "windows-1253", + aliases: [ + "cswindows1253", + ] + ) + + /// IANA Characater Set `windows-1254` + static let windows1254 = IANACharset( + preferredMIMEName: nil, + name: "windows-1254", + aliases: [ + "cswindows1254", + ] + ) +} + +// MARK: - `String.Encoding` Names + +extension String.Encoding { + private var _ianaCharset: IANACharset? { + switch self { + case .utf8: .utf8 + case .ascii: .usASCII + case .japaneseEUC: .eucJP + case .isoLatin1: .iso8859_1 + case .shiftJIS: .shiftJIS + case .isoLatin2: .iso8859_2 + case .unicode: .utf16 + case .windowsCP1251: .windows1251 + case .windowsCP1252: .windows1252 + case .windowsCP1253: .windows1253 + case .windowsCP1254: .windows1254 + case .windowsCP1250: .windows1250 + case .iso2022JP: .iso2022JP + case .macOSRoman: .macintosh + case .utf16BigEndian: .utf16BE + case .utf16LittleEndian: .utf16LE + case .utf32: .utf32 + case .utf32BigEndian: .utf32BE + case .utf32LittleEndian: .utf32LE + default: nil + } + } + + /// The name of this encoding that is compatible with the one of the IANA registry "charset". + @available(FoundationPreview 6.2, *) + public var ianaName: String? { + return _ianaCharset?.representativeName + } + + /// Creates an instance from the name of the IANA registry "charset". + @available(FoundationPreview 6.2, *) + public init?(ianaName charsetName: String) { + func __determineEncoding() -> String.Encoding? { + func __matches(_ charsets: IANACharset...) -> Bool { + assert(!charsets.isEmpty) + return charsets.contains { + $0.matches( + charsetName, + tokenizedBy: ASCIICaseInsensitiveTokenizer.self + ) + } + } + + return if __matches(.utf8) { + .utf8 + } else if __matches(.usASCII) { + .ascii + } else if __matches(.eucJP) { + .japaneseEUC + } else if __matches(.iso8859_1) { + .isoLatin1 + } else if __matches(.shiftJIS) { + .shiftJIS + } else if __matches(.iso8859_2) { + .isoLatin2 + } else if __matches(.utf16) { + .utf16 + } else if __matches(.windows1251) { + .windowsCP1251 + } else if __matches(.windows1252) { + .windowsCP1252 + } else if __matches(.windows1253) { + .windowsCP1253 + } else if __matches(.windows1254) { + .windowsCP1254 + } else if __matches(.windows1250) { + .windowsCP1250 + } else if __matches(.iso2022JP) { + .iso2022JP + } else if __matches(.macintosh) { + .macOSRoman + } else if __matches(.utf16BE) { + .utf16BigEndian + } else if __matches(.utf16LE) { + .utf16LittleEndian + } else if __matches(.utf32) { + .utf32 + } else if __matches(.utf32BE) { + .utf32BigEndian + } else if __matches(.utf32LE) { + .utf32LittleEndian + } else { + nil + } + } + + guard let encoding = __determineEncoding() else { + return nil + } + self = encoding + } +} + From 0ea8aff0a7b772fd8500bc7bb36d68b7379ce4db Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Fri, 9 May 2025 16:27:55 +0900 Subject: [PATCH 02/14] Import tests for String Encoding Names from other repo. - source: https://github.com/YOCKOW/SF-StringEncodingNameImpl/blob/0.4.0/Tests/StringEncodingNameImplTests/StringEncodingNameParserTests.swift --- .../StringTests.swift | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/Tests/FoundationEssentialsTests/StringTests.swift b/Tests/FoundationEssentialsTests/StringTests.swift index 26286be15..a2305ff82 100644 --- a/Tests/FoundationEssentialsTests/StringTests.swift +++ b/Tests/FoundationEssentialsTests/StringTests.swift @@ -1397,6 +1397,69 @@ private struct StringTests { "abcd🎺efgh" ]) } + + func test_Encoding_names() { + // Encoding to Name + XCTAssertEqual(String._Encoding.ascii.ianaName, "US-ASCII") + XCTAssertEqual(String._Encoding.nextstep.ianaName, nil) + XCTAssertEqual(String._Encoding.japaneseEUC.ianaName, "EUC-JP") + XCTAssertEqual(String._Encoding.utf8.ianaName, "UTF-8") + XCTAssertEqual(String._Encoding.isoLatin1.ianaName, "ISO-8859-1") + XCTAssertEqual(String._Encoding.symbol.ianaName, nil) + XCTAssertEqual(String._Encoding.nonLossyASCII.ianaName, nil) + XCTAssertEqual(String._Encoding.shiftJIS.ianaName, "Shift_JIS") + XCTAssertEqual(String._Encoding.isoLatin2.ianaName, "ISO-8859-2") + XCTAssertEqual(String._Encoding.unicode.ianaName, "UTF-16") + XCTAssertEqual(String._Encoding.windowsCP1251.ianaName, "windows-1251") + XCTAssertEqual(String._Encoding.windowsCP1252.ianaName, "windows-1252") + XCTAssertEqual(String._Encoding.windowsCP1253.ianaName, "windows-1253") + XCTAssertEqual(String._Encoding.windowsCP1254.ianaName, "windows-1254") + XCTAssertEqual(String._Encoding.windowsCP1250.ianaName, "windows-1250") + XCTAssertEqual(String._Encoding.iso2022JP.ianaName, "ISO-2022-JP") + XCTAssertEqual(String._Encoding.macOSRoman.ianaName, "macintosh") + XCTAssertEqual(String._Encoding.utf16BigEndian.ianaName, "UTF-16BE") + XCTAssertEqual(String._Encoding.utf16LittleEndian.ianaName, "UTF-16LE") + XCTAssertEqual(String._Encoding.utf32.ianaName, "UTF-32") + XCTAssertEqual(String._Encoding.utf32BigEndian.ianaName, "UTF-32BE") + XCTAssertEqual(String._Encoding.utf32LittleEndian.ianaName, "UTF-32LE") + XCTAssertEqual(String._Encoding(rawValue: .max).ianaName, nil) + + // Name to Encoding + XCTAssertEqual(String._Encoding(ianaName: "us-ascii"), .ascii) + XCTAssertEqual(String._Encoding(ianaName: "iso-ir-2"), nil) + XCTAssertEqual(String._Encoding(ianaName: "x-nextstep"), nil) + XCTAssertEqual(String._Encoding(ianaName: "euc-jp"), .japaneseEUC) + XCTAssertEqual(String._Encoding(ianaName: "CP51932"), nil) + XCTAssertEqual(String._Encoding(ianaName: "utf-8"), .utf8) + XCTAssertEqual(String._Encoding(ianaName: "iso_8859-1"), .isoLatin1) + XCTAssertEqual(String._Encoding(ianaName: "x-mac-symbol"), nil) + XCTAssertEqual(String._Encoding(ianaName: "Adobe-symbol-encoding"), nil) + XCTAssertEqual(String._Encoding(ianaName: "cp932"), nil) + XCTAssertEqual(String._Encoding(ianaName: "shift_jis"), .shiftJIS) + XCTAssertEqual(String._Encoding(ianaName: "windows-31j"), nil) + XCTAssertEqual(String._Encoding(ianaName: "iso_8859-2"), .isoLatin2) + XCTAssertEqual(String._Encoding(ianaName: "utf-16"), .utf16) + XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-2"), nil) + XCTAssertEqual(String._Encoding(ianaName: "unicode-1-1"), nil) + XCTAssertEqual(String._Encoding(ianaName: "windows-1251"), .windowsCP1251) + XCTAssertEqual(String._Encoding(ianaName: "windows-1252"), .windowsCP1252) + XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1"), nil) + XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1"), nil) + XCTAssertEqual(String._Encoding(ianaName: "windows-1253"), .windowsCP1253) + XCTAssertEqual(String._Encoding(ianaName: "windows-1254"), .windowsCP1254) + XCTAssertEqual(String._Encoding(ianaName: "iso-8859-9-windows-Latin-5"), nil) + XCTAssertEqual(String._Encoding(ianaName: "windows-1250"), .windowsCP1250) + XCTAssertEqual(String._Encoding(ianaName: "iso-8859-2-windows-Latin-2"), nil) + XCTAssertEqual(String._Encoding(ianaName: "iso-2022-jp"), .iso2022JP) + XCTAssertEqual(String._Encoding(ianaName: "macintosh"), .macOSRoman) + XCTAssertEqual(String._Encoding(ianaName: "utf-16be"), .utf16BigEndian) + XCTAssertEqual(String._Encoding(ianaName: "utf-16le"), .utf16LittleEndian) + XCTAssertEqual(String._Encoding(ianaName: "utf-32"), .utf32) + XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-4"), nil) + XCTAssertEqual(String._Encoding(ianaName: "utf-32be"), .utf32BigEndian) + XCTAssertEqual(String._Encoding(ianaName: "utf-32le"), .utf32LittleEndian) + XCTAssertEqual(String._Encoding(ianaName: "foo-bar-baz"), nil) + } } // MARK: - Helper functions From 7acaa40d0ed11bd088defe6cafbe47ac6126cb0a Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Tue, 17 Jun 2025 10:23:47 +0900 Subject: [PATCH 03/14] Remove dead code in terms of the current proposal. --- .../String/String+Encoding+Names.swift | 71 ------------------- 1 file changed, 71 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 07ca26c21..48c1c37ce 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -14,15 +14,6 @@ // MARK: - Private extensions for parsing encoding names private extension Unicode.Scalar { - var _isASCIINumeric: Bool { - return ("0"..."9").contains(self) - } - - var _asciiNumericValue: Int { - assert(_isASCIINumeric) - return Int(self.value - 0x30) - } - /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". /// /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace @@ -99,68 +90,6 @@ extension StringEncodingNameTokenizer where Self: ~Copyable { } } -/// ICU-independent parser that follows [Charset Alias Matching](https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching). -private struct UTS22Tokenizer: StringEncodingNameTokenizer, ~Copyable { - enum Token: Equatable { - case numeric(Int) - case alphabet(ASCIICaseInsensitiveUnicodeScalar) - } - - enum Error: Swift.Error { - case tooLargeNumericValue - } - - let scalars: String.UnicodeScalarView - - private var _currentIndex: String.UnicodeScalarView.Index - - init(name: String) { - self.scalars = name.unicodeScalars - self._currentIndex = scalars.startIndex - } - - mutating func nextToken() throws -> Token? { - guard _currentIndex < scalars.endIndex else { - return nil - } - - let scalar = scalars[_currentIndex] - switch scalar { - case "0"..."9": - // Parse a numeric value ignoring leading zeros. - // - // NOTE: To prevent the value from overflow, a threhold is set here. - // The max number of digits to be expected is 8 as of now: i.g. `csISO42JISC62261978`. - // It wouldn't matter to throw an error in practice when the value is too large. - - let threshold: Int = 999_999_999 - var value = scalar._asciiNumericValue - scalars.formIndex(after: &_currentIndex) - while _currentIndex < scalars.endIndex { - let currentScalar = scalars[_currentIndex] - guard currentScalar._isASCIINumeric else { - break - } - value = value * 10 + currentScalar._asciiNumericValue - if value > threshold { - throw Error.tooLargeNumericValue - } - scalars.formIndex(after: &_currentIndex) - } - return .numeric(value) - case "A"..."Z", "a"..."z": - scalars.formIndex(after: &_currentIndex) - return .alphabet(ASCIICaseInsensitiveUnicodeScalar(scalar)) - default: - scalars.formIndex(after: &_currentIndex) - if _currentIndex < scalars.endIndex { - return try nextToken() - } - return nil - } - } -} - /// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s. private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { From 556347230dc0a9042ec31c4263199dedcf634c6c Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 21 Sep 2025 15:07:46 +0900 Subject: [PATCH 04/14] Use `Testing` for String Encoding Names tests. --- .../StringTests.swift | 116 +++++++++--------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/Tests/FoundationEssentialsTests/StringTests.swift b/Tests/FoundationEssentialsTests/StringTests.swift index a2305ff82..6a6781874 100644 --- a/Tests/FoundationEssentialsTests/StringTests.swift +++ b/Tests/FoundationEssentialsTests/StringTests.swift @@ -1398,67 +1398,67 @@ private struct StringTests { ]) } - func test_Encoding_names() { + @Test func encodingNames() { // Encoding to Name - XCTAssertEqual(String._Encoding.ascii.ianaName, "US-ASCII") - XCTAssertEqual(String._Encoding.nextstep.ianaName, nil) - XCTAssertEqual(String._Encoding.japaneseEUC.ianaName, "EUC-JP") - XCTAssertEqual(String._Encoding.utf8.ianaName, "UTF-8") - XCTAssertEqual(String._Encoding.isoLatin1.ianaName, "ISO-8859-1") - XCTAssertEqual(String._Encoding.symbol.ianaName, nil) - XCTAssertEqual(String._Encoding.nonLossyASCII.ianaName, nil) - XCTAssertEqual(String._Encoding.shiftJIS.ianaName, "Shift_JIS") - XCTAssertEqual(String._Encoding.isoLatin2.ianaName, "ISO-8859-2") - XCTAssertEqual(String._Encoding.unicode.ianaName, "UTF-16") - XCTAssertEqual(String._Encoding.windowsCP1251.ianaName, "windows-1251") - XCTAssertEqual(String._Encoding.windowsCP1252.ianaName, "windows-1252") - XCTAssertEqual(String._Encoding.windowsCP1253.ianaName, "windows-1253") - XCTAssertEqual(String._Encoding.windowsCP1254.ianaName, "windows-1254") - XCTAssertEqual(String._Encoding.windowsCP1250.ianaName, "windows-1250") - XCTAssertEqual(String._Encoding.iso2022JP.ianaName, "ISO-2022-JP") - XCTAssertEqual(String._Encoding.macOSRoman.ianaName, "macintosh") - XCTAssertEqual(String._Encoding.utf16BigEndian.ianaName, "UTF-16BE") - XCTAssertEqual(String._Encoding.utf16LittleEndian.ianaName, "UTF-16LE") - XCTAssertEqual(String._Encoding.utf32.ianaName, "UTF-32") - XCTAssertEqual(String._Encoding.utf32BigEndian.ianaName, "UTF-32BE") - XCTAssertEqual(String._Encoding.utf32LittleEndian.ianaName, "UTF-32LE") - XCTAssertEqual(String._Encoding(rawValue: .max).ianaName, nil) + #expect(String.Encoding.ascii.ianaName == "US-ASCII") + #expect(String.Encoding.nextstep.ianaName == nil) + #expect(String.Encoding.japaneseEUC.ianaName == "EUC-JP") + #expect(String.Encoding.utf8.ianaName == "UTF-8") + #expect(String.Encoding.isoLatin1.ianaName == "ISO-8859-1") + #expect(String.Encoding.symbol.ianaName == nil) + #expect(String.Encoding.nonLossyASCII.ianaName == nil) + #expect(String.Encoding.shiftJIS.ianaName == "Shift_JIS") + #expect(String.Encoding.isoLatin2.ianaName == "ISO-8859-2") + #expect(String.Encoding.unicode.ianaName == "UTF-16") + #expect(String.Encoding.windowsCP1251.ianaName == "windows-1251") + #expect(String.Encoding.windowsCP1252.ianaName == "windows-1252") + #expect(String.Encoding.windowsCP1253.ianaName == "windows-1253") + #expect(String.Encoding.windowsCP1254.ianaName == "windows-1254") + #expect(String.Encoding.windowsCP1250.ianaName == "windows-1250") + #expect(String.Encoding.iso2022JP.ianaName == "ISO-2022-JP") + #expect(String.Encoding.macOSRoman.ianaName == "macintosh") + #expect(String.Encoding.utf16BigEndian.ianaName == "UTF-16BE") + #expect(String.Encoding.utf16LittleEndian.ianaName == "UTF-16LE") + #expect(String.Encoding.utf32.ianaName == "UTF-32") + #expect(String.Encoding.utf32BigEndian.ianaName == "UTF-32BE") + #expect(String.Encoding.utf32LittleEndian.ianaName == "UTF-32LE") + #expect(String.Encoding(rawValue: .max).ianaName == nil) // Name to Encoding - XCTAssertEqual(String._Encoding(ianaName: "us-ascii"), .ascii) - XCTAssertEqual(String._Encoding(ianaName: "iso-ir-2"), nil) - XCTAssertEqual(String._Encoding(ianaName: "x-nextstep"), nil) - XCTAssertEqual(String._Encoding(ianaName: "euc-jp"), .japaneseEUC) - XCTAssertEqual(String._Encoding(ianaName: "CP51932"), nil) - XCTAssertEqual(String._Encoding(ianaName: "utf-8"), .utf8) - XCTAssertEqual(String._Encoding(ianaName: "iso_8859-1"), .isoLatin1) - XCTAssertEqual(String._Encoding(ianaName: "x-mac-symbol"), nil) - XCTAssertEqual(String._Encoding(ianaName: "Adobe-symbol-encoding"), nil) - XCTAssertEqual(String._Encoding(ianaName: "cp932"), nil) - XCTAssertEqual(String._Encoding(ianaName: "shift_jis"), .shiftJIS) - XCTAssertEqual(String._Encoding(ianaName: "windows-31j"), nil) - XCTAssertEqual(String._Encoding(ianaName: "iso_8859-2"), .isoLatin2) - XCTAssertEqual(String._Encoding(ianaName: "utf-16"), .utf16) - XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-2"), nil) - XCTAssertEqual(String._Encoding(ianaName: "unicode-1-1"), nil) - XCTAssertEqual(String._Encoding(ianaName: "windows-1251"), .windowsCP1251) - XCTAssertEqual(String._Encoding(ianaName: "windows-1252"), .windowsCP1252) - XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1"), nil) - XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1"), nil) - XCTAssertEqual(String._Encoding(ianaName: "windows-1253"), .windowsCP1253) - XCTAssertEqual(String._Encoding(ianaName: "windows-1254"), .windowsCP1254) - XCTAssertEqual(String._Encoding(ianaName: "iso-8859-9-windows-Latin-5"), nil) - XCTAssertEqual(String._Encoding(ianaName: "windows-1250"), .windowsCP1250) - XCTAssertEqual(String._Encoding(ianaName: "iso-8859-2-windows-Latin-2"), nil) - XCTAssertEqual(String._Encoding(ianaName: "iso-2022-jp"), .iso2022JP) - XCTAssertEqual(String._Encoding(ianaName: "macintosh"), .macOSRoman) - XCTAssertEqual(String._Encoding(ianaName: "utf-16be"), .utf16BigEndian) - XCTAssertEqual(String._Encoding(ianaName: "utf-16le"), .utf16LittleEndian) - XCTAssertEqual(String._Encoding(ianaName: "utf-32"), .utf32) - XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-4"), nil) - XCTAssertEqual(String._Encoding(ianaName: "utf-32be"), .utf32BigEndian) - XCTAssertEqual(String._Encoding(ianaName: "utf-32le"), .utf32LittleEndian) - XCTAssertEqual(String._Encoding(ianaName: "foo-bar-baz"), nil) + #expect(String.Encoding(ianaName: "us-ascii") == .ascii) + #expect(String.Encoding(ianaName: "iso-ir-2") == nil) + #expect(String.Encoding(ianaName: "x-nextstep") == nil) + #expect(String.Encoding(ianaName: "euc-jp") == .japaneseEUC) + #expect(String.Encoding(ianaName: "CP51932") == nil) + #expect(String.Encoding(ianaName: "utf-8") == .utf8) + #expect(String.Encoding(ianaName: "iso_8859-1") == .isoLatin1) + #expect(String.Encoding(ianaName: "x-mac-symbol") == nil) + #expect(String.Encoding(ianaName: "Adobe-symbol-encoding") == nil) + #expect(String.Encoding(ianaName: "cp932") == nil) + #expect(String.Encoding(ianaName: "shift_jis") == .shiftJIS) + #expect(String.Encoding(ianaName: "windows-31j") == nil) + #expect(String.Encoding(ianaName: "iso_8859-2") == .isoLatin2) + #expect(String.Encoding(ianaName: "utf-16") == .utf16) + #expect(String.Encoding(ianaName: "iso-10646-ucs-2") == nil) + #expect(String.Encoding(ianaName: "unicode-1-1") == nil) + #expect(String.Encoding(ianaName: "windows-1251") == .windowsCP1251) + #expect(String.Encoding(ianaName: "windows-1252") == .windowsCP1252) + #expect(String.Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1") == nil) + #expect(String.Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1") == nil) + #expect(String.Encoding(ianaName: "windows-1253") == .windowsCP1253) + #expect(String.Encoding(ianaName: "windows-1254") == .windowsCP1254) + #expect(String.Encoding(ianaName: "iso-8859-9-windows-Latin-5") == nil) + #expect(String.Encoding(ianaName: "windows-1250") == .windowsCP1250) + #expect(String.Encoding(ianaName: "iso-8859-2-windows-Latin-2") == nil) + #expect(String.Encoding(ianaName: "iso-2022-jp") == .iso2022JP) + #expect(String.Encoding(ianaName: "macintosh") == .macOSRoman) + #expect(String.Encoding(ianaName: "utf-16be") == .utf16BigEndian) + #expect(String.Encoding(ianaName: "utf-16le") == .utf16LittleEndian) + #expect(String.Encoding(ianaName: "utf-32") == .utf32) + #expect(String.Encoding(ianaName: "iso-10646-ucs-4") == nil) + #expect(String.Encoding(ianaName: "utf-32be") == .utf32BigEndian) + #expect(String.Encoding(ianaName: "utf-32le") == .utf32LittleEndian) + #expect(String.Encoding(ianaName: "foo-bar-baz") == nil) } } From a783db10f9231a2f12fd923e41f2c26001e167a9 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 21 Sep 2025 15:18:15 +0900 Subject: [PATCH 05/14] NFC: Fix indentation in "String+Encoding+Names.swift". --- .../String/String+Encoding+Names.swift | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 48c1c37ce..8c5b76532 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -14,15 +14,15 @@ // MARK: - Private extensions for parsing encoding names private extension Unicode.Scalar { - /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". - /// - /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace - var _isASCIIWhitespace: Bool { - switch self.value { - case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true - default: false + /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". + /// + /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace + var _isASCIIWhitespace: Bool { + switch self.value { + case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true + default: false + } } - } } private extension String { @@ -95,9 +95,9 @@ extension StringEncodingNameTokenizer where Self: ~Copyable { private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { typealias Token = ASCIICaseInsensitiveUnicodeScalar - enum Error: Swift.Error { - case nonASCII - } + enum Error: Swift.Error { + case nonASCII + } let scalars: Substring.UnicodeScalarView From 7515bf433ef105837c6124963ec44ee845044fb6 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 21 Sep 2025 16:19:32 +0900 Subject: [PATCH 06/14] SF-0033: Adjust comments/attributes to match the accepted proposal. --- .../String/String+Encoding+Names.swift | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 8c5b76532..12f6466de 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -409,13 +409,17 @@ extension String.Encoding { } /// The name of this encoding that is compatible with the one of the IANA registry "charset". - @available(FoundationPreview 6.2, *) + @available(FoundationPreview 6.3, *) public var ianaName: String? { return _ianaCharset?.representativeName } /// Creates an instance from the name of the IANA registry "charset". - @available(FoundationPreview 6.2, *) + /// + /// - Note: The given name is compared to each IANA "charset" name + /// with ASCII case-insensitive collation + /// to determine which encoding is suitable. + @available(FoundationPreview 6.3, *) public init?(ianaName charsetName: String) { func __determineEncoding() -> String.Encoding? { func __matches(_ charsets: IANACharset...) -> Bool { From c72697106b6028003552b100c996c3bf27c88a94 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 12 Oct 2025 15:02:50 +0900 Subject: [PATCH 07/14] Auto-generate Swift source code for IANA Charset names. --- .../String/IANACharsetNames.swift | 213 ++++++++++++++++++ .../String/String+Encoding+Names.swift | 201 +---------------- utils/update-iana-charset-names | 62 +++++ utils/update-iana-charset-names-impl.py | 174 ++++++++++++++ 4 files changed, 451 insertions(+), 199 deletions(-) create mode 100644 Sources/FoundationEssentials/String/IANACharsetNames.swift create mode 100755 utils/update-iana-charset-names create mode 100644 utils/update-iana-charset-names-impl.py diff --git a/Sources/FoundationEssentials/String/IANACharsetNames.swift b/Sources/FoundationEssentials/String/IANACharsetNames.swift new file mode 100644 index 000000000..8f3e88f09 --- /dev/null +++ b/Sources/FoundationEssentials/String/IANACharsetNames.swift @@ -0,0 +1,213 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + + +// WARNING: DO NOT EDIT THIS FILE DIRECTLY. +// This is auto-generated by `update-iana-charset-names`. + + +extension IANACharset { + /// IANA Charset `US-ASCII`. + static let usASCII = IANACharset( + preferredMIMEName: "US-ASCII", + name: "US-ASCII", + aliases: [ + "iso-ir-6", + "ANSI_X3.4-1968", + "ANSI_X3.4-1986", + "ISO_646.irv:1991", + "ISO646-US", + "US-ASCII", + "us", + "IBM367", + "cp367", + "csASCII", + ] + ) + + /// IANA Charset `ISO-8859-1`. + static let iso8859_1 = IANACharset( + preferredMIMEName: "ISO-8859-1", + name: "ISO_8859-1:1987", + aliases: [ + "iso-ir-100", + "ISO_8859-1", + "ISO-8859-1", + "latin1", + "l1", + "IBM819", + "CP819", + "csISOLatin1", + ] + ) + + /// IANA Charset `ISO-8859-2`. + static let iso8859_2 = IANACharset( + preferredMIMEName: "ISO-8859-2", + name: "ISO_8859-2:1987", + aliases: [ + "iso-ir-101", + "ISO_8859-2", + "ISO-8859-2", + "latin2", + "l2", + "csISOLatin2", + ] + ) + + /// IANA Charset `Shift_JIS`. + static let shiftJIS = IANACharset( + preferredMIMEName: "Shift_JIS", + name: "Shift_JIS", + aliases: [ + "MS_Kanji", + "csShiftJIS", + ] + ) + + /// IANA Charset `EUC-JP`. + static let eucJP = IANACharset( + preferredMIMEName: "EUC-JP", + name: "Extended_UNIX_Code_Packed_Format_for_Japanese", + aliases: [ + "csEUCPkdFmtJapanese", + "EUC-JP", + ] + ) + + /// IANA Charset `ISO-2022-JP`. + static let iso2022JP = IANACharset( + preferredMIMEName: "ISO-2022-JP", + name: "ISO-2022-JP", + aliases: [ + "csISO2022JP", + ] + ) + + /// IANA Charset `UTF-8`. + static let utf8 = IANACharset( + preferredMIMEName: nil, + name: "UTF-8", + aliases: [ + "csUTF8", + ] + ) + + /// IANA Charset `UTF-16BE`. + static let utf16BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16BE", + aliases: [ + "csUTF16BE", + ] + ) + + /// IANA Charset `UTF-16LE`. + static let utf16LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16LE", + aliases: [ + "csUTF16LE", + ] + ) + + /// IANA Charset `UTF-16`. + static let utf16 = IANACharset( + preferredMIMEName: nil, + name: "UTF-16", + aliases: [ + "csUTF16", + ] + ) + + /// IANA Charset `UTF-32`. + static let utf32 = IANACharset( + preferredMIMEName: nil, + name: "UTF-32", + aliases: [ + "csUTF32", + ] + ) + + /// IANA Charset `UTF-32BE`. + static let utf32BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32BE", + aliases: [ + "csUTF32BE", + ] + ) + + /// IANA Charset `UTF-32LE`. + static let utf32LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32LE", + aliases: [ + "csUTF32LE", + ] + ) + + /// IANA Charset `macintosh`. + static let macintosh = IANACharset( + preferredMIMEName: nil, + name: "macintosh", + aliases: [ + "mac", + "csMacintosh", + ] + ) + + /// IANA Charset `windows-1250`. + static let windows1250 = IANACharset( + preferredMIMEName: nil, + name: "windows-1250", + aliases: [ + "cswindows1250", + ] + ) + + /// IANA Charset `windows-1251`. + static let windows1251 = IANACharset( + preferredMIMEName: nil, + name: "windows-1251", + aliases: [ + "cswindows1251", + ] + ) + + /// IANA Charset `windows-1252`. + static let windows1252 = IANACharset( + preferredMIMEName: nil, + name: "windows-1252", + aliases: [ + "cswindows1252", + ] + ) + + /// IANA Charset `windows-1253`. + static let windows1253 = IANACharset( + preferredMIMEName: nil, + name: "windows-1253", + aliases: [ + "cswindows1253", + ] + ) + + /// IANA Charset `windows-1254`. + static let windows1254 = IANACharset( + preferredMIMEName: nil, + name: "windows-1254", + aliases: [ + "cswindows1254", + ] + ) +} diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 12f6466de..ba2cc32ef 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -72,7 +72,7 @@ private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, } /// A type to tokenize string for `String.Encoding` names. -private protocol StringEncodingNameTokenizer: ~Copyable { +internal protocol StringEncodingNameTokenizer: ~Copyable { associatedtype Token: Equatable init(name: String) mutating func nextToken() throws -> Token? @@ -142,7 +142,7 @@ private extension String { // MARK: - IANA Charset Names /// Info about IANA Charset. -private struct IANACharset { +internal struct IANACharset { /// Preferred MIME Name let preferredMIMEName: String? @@ -182,203 +182,6 @@ private struct IANACharset { } } -// Extracted only necessary charsets from https://www.iana.org/assignments/character-sets/character-sets.xhtml -extension IANACharset { - /// IANA Characater Set `US-ASCII` - static let usASCII = IANACharset( - preferredMIMEName: "US-ASCII", - name: "US-ASCII", - aliases: [ - "iso-ir-6", - "ANSI_X3.4-1968", - "ANSI_X3.4-1986", - "ISO_646.irv:1991", - "ISO646-US", - "US-ASCII", - "us", - "IBM367", - "cp367", - "csASCII", - ] - ) - - /// IANA Characater Set `ISO-8859-1` - static let iso8859_1 = IANACharset( - preferredMIMEName: "ISO-8859-1", - name: "ISO_8859-1:1987", - aliases: [ - "iso-ir-100", - "ISO_8859-1", - "ISO-8859-1", - "latin1", - "l1", - "IBM819", - "CP819", - "csISOLatin1", - ] - ) - - /// IANA Characater Set `ISO-8859-2` - static let iso8859_2 = IANACharset( - preferredMIMEName: "ISO-8859-2", - name: "ISO_8859-2:1987", - aliases: [ - "iso-ir-101", - "ISO_8859-2", - "ISO-8859-2", - "latin2", - "l2", - "csISOLatin2", - ] - ) - - /// IANA Characater Set `Shift_JIS` - static let shiftJIS = IANACharset( - preferredMIMEName: "Shift_JIS", - name: "Shift_JIS", - aliases: [ - "MS_Kanji", - "csShiftJIS", - ] - ) - - /// IANA Characater Set `EUC-JP` - static let eucJP = IANACharset( - preferredMIMEName: "EUC-JP", - name: "Extended_UNIX_Code_Packed_Format_for_Japanese", - aliases: [ - "csEUCPkdFmtJapanese", - "EUC-JP", - ] - ) - - /// IANA Characater Set `ISO-2022-JP` - static let iso2022JP = IANACharset( - preferredMIMEName: "ISO-2022-JP", - name: "ISO-2022-JP", - aliases: [ - "csISO2022JP", - ] - ) - - /// IANA Characater Set `UTF-8` - static let utf8 = IANACharset( - preferredMIMEName: nil, - name: "UTF-8", - aliases: [ - "csUTF8", - ] - ) - - /// IANA Characater Set `UTF-16BE` - static let utf16BE = IANACharset( - preferredMIMEName: nil, - name: "UTF-16BE", - aliases: [ - "csUTF16BE", - ] - ) - - /// IANA Characater Set `UTF-16LE` - static let utf16LE = IANACharset( - preferredMIMEName: nil, - name: "UTF-16LE", - aliases: [ - "csUTF16LE", - ] - ) - - /// IANA Characater Set `UTF-16` - static let utf16 = IANACharset( - preferredMIMEName: nil, - name: "UTF-16", - aliases: [ - "csUTF16", - ] - ) - - /// IANA Characater Set `UTF-32` - static let utf32 = IANACharset( - preferredMIMEName: nil, - name: "UTF-32", - aliases: [ - "csUTF32", - ] - ) - - /// IANA Characater Set `UTF-32BE` - static let utf32BE = IANACharset( - preferredMIMEName: nil, - name: "UTF-32BE", - aliases: [ - "csUTF32BE", - ] - ) - - /// IANA Characater Set `UTF-32LE` - static let utf32LE = IANACharset( - preferredMIMEName: nil, - name: "UTF-32LE", - aliases: [ - "csUTF32LE", - ] - ) - - /// IANA Characater Set `macintosh` - static let macintosh = IANACharset( - preferredMIMEName: nil, - name: "macintosh", - aliases: [ - "mac", - "csMacintosh", - ] - ) - - /// IANA Characater Set `windows-1250` - static let windows1250 = IANACharset( - preferredMIMEName: nil, - name: "windows-1250", - aliases: [ - "cswindows1250", - ] - ) - - /// IANA Characater Set `windows-1251` - static let windows1251 = IANACharset( - preferredMIMEName: nil, - name: "windows-1251", - aliases: [ - "cswindows1251", - ] - ) - - /// IANA Characater Set `windows-1252` - static let windows1252 = IANACharset( - preferredMIMEName: nil, - name: "windows-1252", - aliases: [ - "cswindows1252", - ] - ) - - /// IANA Characater Set `windows-1253` - static let windows1253 = IANACharset( - preferredMIMEName: nil, - name: "windows-1253", - aliases: [ - "cswindows1253", - ] - ) - - /// IANA Characater Set `windows-1254` - static let windows1254 = IANACharset( - preferredMIMEName: nil, - name: "windows-1254", - aliases: [ - "cswindows1254", - ] - ) -} // MARK: - `String.Encoding` Names diff --git a/utils/update-iana-charset-names b/utils/update-iana-charset-names new file mode 100755 index 000000000..23d9a2ef8 --- /dev/null +++ b/utils/update-iana-charset-names @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +##===----------------------------------------------------------------------===## +## +## This source file is part of the Swift.org open source project +## +## Copyright (c) 2025 Apple Inc. and the Swift project authors +## Licensed under Apache License v2.0 with Runtime Library Exception +## +## See https://swift.org/LICENSE.txt for license information +## See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +## +##===----------------------------------------------------------------------===## + +# This is a shell script that generates a Swift source code file which contains +# the list of IANA "Character Sets". + +set -eu + +declare -r commandName="$(basename "$0")" +declare -r utilsDir="$(cd "$(dirname "$0")" && pwd)" +declare -r foundationRepoDir="$(cd "${utilsDir}/.." && pwd)" +declare -r targetSwiftFileRelativePath="Sources/FoundationEssentials/String/IANACharsetNames.swift" + +declare -r copyrightYear=$( + currentYear=$(date +%Y) + if [[ $currentYear -eq 2025 ]]; then + echo 2025 + else + echo 2025-${currentYear} + fi +) +declare -r swiftLicenseHeader=" +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) ${copyrightYear} Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// +" +declare -r warningComment=" +// WARNING: DO NOT EDIT THIS FILE DIRECTLY. +// This is auto-generated by \`${commandName}\`. + +" + +echo "Generating Swift source code..." 1>&2 +declare generatedCode +generatedCode=$( + echo "${swiftLicenseHeader##$'\n'}" + echo "$warningComment" + python3 "${utilsDir}/${commandName}-impl.py" +) + +echo "Writing the code to '${targetSwiftFileRelativePath}'..." 1>&2 +echo "$generatedCode" >"${foundationRepoDir}/${targetSwiftFileRelativePath}" + +echo "Done." 1>&2 diff --git a/utils/update-iana-charset-names-impl.py b/utils/update-iana-charset-names-impl.py new file mode 100644 index 000000000..399fe16e4 --- /dev/null +++ b/utils/update-iana-charset-names-impl.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +##===----------------------------------------------------------------------===## +## +## This source file is part of the Swift.org open source project +## +## Copyright (c) 2025 Apple Inc. and the Swift project authors +## Licensed under Apache License v2.0 with Runtime Library Exception +## +## See https://swift.org/LICENSE.txt for license information +## See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +## +##===----------------------------------------------------------------------===## + +""" +This is a python script that converts an XML file containing the list of IANA +"Character Sets" to Swift source code. +This script generates minimum code and is intended to be executed by other shell +script. +""" + +import re +import urllib.request as request +import xml.etree.ElementTree as ElemTree +from typing import List, Optional + +REQUIRED_CHARSET_NAMES: List[str] = [ + "UTF-8", + "US-ASCII", + "EUC-JP", + "ISO-8859-1", + "Shift_JIS", + "ISO-8859-2", + "UTF-16", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1250", + "ISO-2022-JP", + "macintosh", + "UTF-16BE", + "UTF-16LE", + "UTF-32", + "UTF-32BE", + "UTF-32LE", +] +CHARSETS_XML_URL = "https://www.iana.org/assignments/character-sets/character-sets.xml" +CHARSETS_XML_NS = "http://www.iana.org/assignments" +SWIFT_CODE_INDENT = " " + + +class IANACharsetNameRecord: + """Representation of element in 'character-sets.xml' + + The structure of element is as blow: + + US-ASCII + + 3 + ANSI X3.4-1986 + iso-ir-6 + ANSI_X3.4-1968 + ANSI_X3.4-1986 + ISO_646.irv:1991 + ISO646-US + US-ASCII + us + IBM367 + cp367 + csASCII + US-ASCII + + """ + + def __init__(self, recordElem: ElemTree.Element): + self._name: str = recordElem.find('./{%s}name' % (CHARSETS_XML_NS)).text + self._preferredMIMEName: Optional[str] = getattr( + recordElem.find('./{%s}preferred_alias' % (CHARSETS_XML_NS)), + 'text', + None + ) + self._aliases: List[str] = list(map( + lambda aliasElem: aliasElem.text, + recordElem.findall('./{%s}alias' % (CHARSETS_XML_NS)) + )) + self._camelCasedName = None + + @property + def name(self) -> str: + return self._name + + @property + def preferredMIMEName(self) -> Optional[str]: + return self._preferredMIMEName + + @property + def representativeName(self) -> str: + return self.preferredMIMEName or self.name + + @property + def aliases(self) -> List[str]: + return self._aliases + + @property + def camelCasedName(self) -> str: + if (self._camelCasedName is not None): + return self._camelCasedName + + camelCasedName = "" + previousWord = None + for ii, word in enumerate(re.split(r"[^0-9A-Za-z]", self.representativeName)): + if previousWord is None: + camelCasedName = word.lower() + else: + if re.search(r"[0-9]$", previousWord) and re.search(r"^[0-9]", word): + camelCasedName += "_" + + if (re.fullmatch("[0-9]*[A-Z]+", word)): + camelCasedName += word + else: + camelCasedName += word.capitalize() + + previousWord = word + + self._camelCasedName = camelCasedName + return camelCasedName + + @property + def swiftCodeLines(self) -> List[str]: + def __stringLiteralOrNil(string: Optional[str]) -> str: + if (string is None): + return 'nil' + return f'"{string}"' + + lines: List[str] = [] + lines.append(f"/// IANA Charset `{self.representativeName}`.") + lines.append(f"static let {self.camelCasedName} = IANACharset(") + lines.append(f"{SWIFT_CODE_INDENT}preferredMIMEName: { + __stringLiteralOrNil(self.preferredMIMEName) + },") + lines.append(f'{SWIFT_CODE_INDENT}name: "{self.name}",') + lines.append(f"{SWIFT_CODE_INDENT}aliases: [") + for alias in self.aliases: + lines.append(f"{SWIFT_CODE_INDENT * 2}\"{alias}\",") + lines.append(f"{SWIFT_CODE_INDENT}]") + lines.append(")") + return lines + + +def generateSwiftCode() -> str: + charsetsXMLString = request.urlopen(request.Request(CHARSETS_XML_URL)).read() + charsetsXMLRoot = ElemTree.fromstring(charsetsXMLString) + charsetsXMLRecordElements = charsetsXMLRoot.findall( + "./{%s}registry/{%s}record" % (CHARSETS_XML_NS, CHARSETS_XML_NS) + ) + result = "extension IANACharset {" + for record in map( + lambda recordElem: IANACharsetNameRecord(recordElem), + charsetsXMLRecordElements + ): + if (record.representativeName not in REQUIRED_CHARSET_NAMES): + continue + result += "\n" + result += "\n".join(map( + lambda line: SWIFT_CODE_INDENT + line, + record.swiftCodeLines + )) + result += "\n" + result += "}\n" + return result + + +if __name__ == "__main__": + print(generateSwiftCode()) From 5c9492c5b246af6be0f69f21dbaec004aefb444f Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Thu, 16 Oct 2025 14:33:52 +0900 Subject: [PATCH 08/14] Remove unnecessary `@inlinable`. --- Sources/FoundationEssentials/String/String+Encoding+Names.swift | 2 -- 1 file changed, 2 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index ba2cc32ef..1d417f003 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -45,7 +45,6 @@ private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, let scalar: Unicode.Scalar - @inlinable init(_ scalar: Unicode.Scalar) { assert(scalar.isASCII) self.scalar = scalar @@ -55,7 +54,6 @@ private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, self.init(Unicode.Scalar(unicodeScalarLiteral: value)) } - @inlinable static func ==( lhs: ASCIICaseInsensitiveUnicodeScalar, rhs: ASCIICaseInsensitiveUnicodeScalar From ac421272d276524f2aeb6fcbeac6f7714408d474 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Thu, 16 Oct 2025 15:02:42 +0900 Subject: [PATCH 09/14] Simplify `String.init(ianaName:)`. --- .../String/String+Encoding+Names.swift | 78 +++++++------------ 1 file changed, 29 insertions(+), 49 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 1d417f003..587b2eb45 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -222,58 +222,38 @@ extension String.Encoding { /// to determine which encoding is suitable. @available(FoundationPreview 6.3, *) public init?(ianaName charsetName: String) { + let possibilities: [String.Encoding] = [ + .utf8, + .ascii, + .japaneseEUC, + .isoLatin1, + .shiftJIS, + .isoLatin2, + .unicode, // .utf16 + .windowsCP1251, + .windowsCP1252, + .windowsCP1253, + .windowsCP1254, + .windowsCP1250, + .iso2022JP, + .macOSRoman, + .utf16BigEndian, + .utf16LittleEndian, + .utf32, + .utf32BigEndian, + .utf32LittleEndian, + ] + func __determineEncoding() -> String.Encoding? { - func __matches(_ charsets: IANACharset...) -> Bool { - assert(!charsets.isEmpty) - return charsets.contains { - $0.matches( - charsetName, - tokenizedBy: ASCIICaseInsensitiveTokenizer.self - ) + for encoding in possibilities { + guard let ianaCharset = encoding._ianaCharset else { + continue + } + if ianaCharset.matches(charsetName, tokenizedBy: ASCIICaseInsensitiveTokenizer.self) { + return encoding } } - - return if __matches(.utf8) { - .utf8 - } else if __matches(.usASCII) { - .ascii - } else if __matches(.eucJP) { - .japaneseEUC - } else if __matches(.iso8859_1) { - .isoLatin1 - } else if __matches(.shiftJIS) { - .shiftJIS - } else if __matches(.iso8859_2) { - .isoLatin2 - } else if __matches(.utf16) { - .utf16 - } else if __matches(.windows1251) { - .windowsCP1251 - } else if __matches(.windows1252) { - .windowsCP1252 - } else if __matches(.windows1253) { - .windowsCP1253 - } else if __matches(.windows1254) { - .windowsCP1254 - } else if __matches(.windows1250) { - .windowsCP1250 - } else if __matches(.iso2022JP) { - .iso2022JP - } else if __matches(.macintosh) { - .macOSRoman - } else if __matches(.utf16BE) { - .utf16BigEndian - } else if __matches(.utf16LE) { - .utf16LittleEndian - } else if __matches(.utf32) { - .utf32 - } else if __matches(.utf32BE) { - .utf32BigEndian - } else if __matches(.utf32LE) { - .utf32LittleEndian - } else { - nil - } + return nil } guard let encoding = __determineEncoding() else { From c7bdbef8b1998d7083d069b5482f9d245b0605a2 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Thu, 16 Oct 2025 15:17:36 +0900 Subject: [PATCH 10/14] Add new files related to SF-0033 to CMakeLists.txt. --- Sources/FoundationEssentials/String/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Sources/FoundationEssentials/String/CMakeLists.txt b/Sources/FoundationEssentials/String/CMakeLists.txt index 720eb218d..6cc7994d7 100644 --- a/Sources/FoundationEssentials/String/CMakeLists.txt +++ b/Sources/FoundationEssentials/String/CMakeLists.txt @@ -15,10 +15,12 @@ target_sources(FoundationEssentials PRIVATE BidirectionalCollection.swift BuiltInUnicodeScalarSet.swift + IANACharsetNames.swift RegexPatternCache.swift String+Bridging.swift String+Comparison.swift String+Encoding.swift + String+Encoding+Names.swift String+EndianAdaptorSequence.swift String+Essentials.swift String+IO.swift From e674fa689f2b0b0f54ad404514418118a6be8642 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 19 Oct 2025 17:44:33 +0900 Subject: [PATCH 11/14] Rewrite script in Swift instead of Python. In response to: https://github.com/swiftlang/swift-foundation/pull/1286#discussion_r2437753438 --- utils/update-iana-charset-names | 2 +- utils/update-iana-charset-names-impl.py | 174 ------------------ utils/update-iana-charset-names-impl.swift | 195 +++++++++++++++++++++ 3 files changed, 196 insertions(+), 175 deletions(-) delete mode 100644 utils/update-iana-charset-names-impl.py create mode 100755 utils/update-iana-charset-names-impl.swift diff --git a/utils/update-iana-charset-names b/utils/update-iana-charset-names index 23d9a2ef8..e56e972b1 100755 --- a/utils/update-iana-charset-names +++ b/utils/update-iana-charset-names @@ -53,7 +53,7 @@ declare generatedCode generatedCode=$( echo "${swiftLicenseHeader##$'\n'}" echo "$warningComment" - python3 "${utilsDir}/${commandName}-impl.py" + swift -D PRINT_CODE "${utilsDir}/${commandName}-impl.swift" ) echo "Writing the code to '${targetSwiftFileRelativePath}'..." 1>&2 diff --git a/utils/update-iana-charset-names-impl.py b/utils/update-iana-charset-names-impl.py deleted file mode 100644 index 399fe16e4..000000000 --- a/utils/update-iana-charset-names-impl.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env python3 -##===----------------------------------------------------------------------===## -## -## This source file is part of the Swift.org open source project -## -## Copyright (c) 2025 Apple Inc. and the Swift project authors -## Licensed under Apache License v2.0 with Runtime Library Exception -## -## See https://swift.org/LICENSE.txt for license information -## See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors -## -##===----------------------------------------------------------------------===## - -""" -This is a python script that converts an XML file containing the list of IANA -"Character Sets" to Swift source code. -This script generates minimum code and is intended to be executed by other shell -script. -""" - -import re -import urllib.request as request -import xml.etree.ElementTree as ElemTree -from typing import List, Optional - -REQUIRED_CHARSET_NAMES: List[str] = [ - "UTF-8", - "US-ASCII", - "EUC-JP", - "ISO-8859-1", - "Shift_JIS", - "ISO-8859-2", - "UTF-16", - "windows-1251", - "windows-1252", - "windows-1253", - "windows-1254", - "windows-1250", - "ISO-2022-JP", - "macintosh", - "UTF-16BE", - "UTF-16LE", - "UTF-32", - "UTF-32BE", - "UTF-32LE", -] -CHARSETS_XML_URL = "https://www.iana.org/assignments/character-sets/character-sets.xml" -CHARSETS_XML_NS = "http://www.iana.org/assignments" -SWIFT_CODE_INDENT = " " - - -class IANACharsetNameRecord: - """Representation of element in 'character-sets.xml' - - The structure of element is as blow: - - US-ASCII - - 3 - ANSI X3.4-1986 - iso-ir-6 - ANSI_X3.4-1968 - ANSI_X3.4-1986 - ISO_646.irv:1991 - ISO646-US - US-ASCII - us - IBM367 - cp367 - csASCII - US-ASCII - - """ - - def __init__(self, recordElem: ElemTree.Element): - self._name: str = recordElem.find('./{%s}name' % (CHARSETS_XML_NS)).text - self._preferredMIMEName: Optional[str] = getattr( - recordElem.find('./{%s}preferred_alias' % (CHARSETS_XML_NS)), - 'text', - None - ) - self._aliases: List[str] = list(map( - lambda aliasElem: aliasElem.text, - recordElem.findall('./{%s}alias' % (CHARSETS_XML_NS)) - )) - self._camelCasedName = None - - @property - def name(self) -> str: - return self._name - - @property - def preferredMIMEName(self) -> Optional[str]: - return self._preferredMIMEName - - @property - def representativeName(self) -> str: - return self.preferredMIMEName or self.name - - @property - def aliases(self) -> List[str]: - return self._aliases - - @property - def camelCasedName(self) -> str: - if (self._camelCasedName is not None): - return self._camelCasedName - - camelCasedName = "" - previousWord = None - for ii, word in enumerate(re.split(r"[^0-9A-Za-z]", self.representativeName)): - if previousWord is None: - camelCasedName = word.lower() - else: - if re.search(r"[0-9]$", previousWord) and re.search(r"^[0-9]", word): - camelCasedName += "_" - - if (re.fullmatch("[0-9]*[A-Z]+", word)): - camelCasedName += word - else: - camelCasedName += word.capitalize() - - previousWord = word - - self._camelCasedName = camelCasedName - return camelCasedName - - @property - def swiftCodeLines(self) -> List[str]: - def __stringLiteralOrNil(string: Optional[str]) -> str: - if (string is None): - return 'nil' - return f'"{string}"' - - lines: List[str] = [] - lines.append(f"/// IANA Charset `{self.representativeName}`.") - lines.append(f"static let {self.camelCasedName} = IANACharset(") - lines.append(f"{SWIFT_CODE_INDENT}preferredMIMEName: { - __stringLiteralOrNil(self.preferredMIMEName) - },") - lines.append(f'{SWIFT_CODE_INDENT}name: "{self.name}",') - lines.append(f"{SWIFT_CODE_INDENT}aliases: [") - for alias in self.aliases: - lines.append(f"{SWIFT_CODE_INDENT * 2}\"{alias}\",") - lines.append(f"{SWIFT_CODE_INDENT}]") - lines.append(")") - return lines - - -def generateSwiftCode() -> str: - charsetsXMLString = request.urlopen(request.Request(CHARSETS_XML_URL)).read() - charsetsXMLRoot = ElemTree.fromstring(charsetsXMLString) - charsetsXMLRecordElements = charsetsXMLRoot.findall( - "./{%s}registry/{%s}record" % (CHARSETS_XML_NS, CHARSETS_XML_NS) - ) - result = "extension IANACharset {" - for record in map( - lambda recordElem: IANACharsetNameRecord(recordElem), - charsetsXMLRecordElements - ): - if (record.representativeName not in REQUIRED_CHARSET_NAMES): - continue - result += "\n" - result += "\n".join(map( - lambda line: SWIFT_CODE_INDENT + line, - record.swiftCodeLines - )) - result += "\n" - result += "}\n" - return result - - -if __name__ == "__main__": - print(generateSwiftCode()) diff --git a/utils/update-iana-charset-names-impl.swift b/utils/update-iana-charset-names-impl.swift new file mode 100755 index 000000000..c7e83e0d8 --- /dev/null +++ b/utils/update-iana-charset-names-impl.swift @@ -0,0 +1,195 @@ +#!/usr/bin/env swift -D PRINT_CODE +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + +/* + +This is a Swift script that converts an XML file containing the list of IANA +"Character Sets" to Swift source code. +This script generates minimum code and is intended to be executed by other shell +script. + + */ + +import Foundation +#if canImport(FoundationXML) +import FoundationXML +#endif + +// MARK: - Constants + +let requiredCharsetNames = [ + "UTF-8", + "US-ASCII", + "EUC-JP", + "ISO-8859-1", + "Shift_JIS", + "ISO-8859-2", + "UTF-16", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1250", + "ISO-2022-JP", + "macintosh", + "UTF-16BE", + "UTF-16LE", + "UTF-32", + "UTF-32BE", + "UTF-32LE", +] +let charsetsXMLURL = URL( + string: "https://www.iana.org/assignments/character-sets/character-sets.xml" +)! +let charsetsXMLNamespace = "http://www.iana.org/assignments" +let swiftCodeIndent = " " + + +// MARK: - Implementation + +enum CodeGenerationError: Swift.Error { + case missingName + case missingAliasValue + case noRootElement +} + +/// Representation of element in 'character-sets.xml' +/// +/// The structure of element is as blow: +/// ```xml +/// +/// US-ASCII +/// +/// 3 +/// ANSI X3.4-1986 +/// iso-ir-6 +/// ANSI_X3.4-1968 +/// ANSI_X3.4-1986 +/// ISO_646.irv:1991 +/// ISO646-US +/// US-ASCII +/// us +/// IBM367 +/// cp367 +/// csASCII +/// US-ASCII +/// +/// ``` +struct IANACharsetNameRecord { + /// Preferred MIME Name + let preferredMIMEName: String? + + /// The name of this charset + let name: String + + /// The aliases of this charset + let aliases: Array + + var representativeName: String { + return preferredMIMEName ?? name + } + + var swiftCodeLines: [String] { + var lines: [String] = [] + lines.append("/// IANA Charset `\(representativeName)`.") + lines.append("static let \(representativeName._camelcased()) = IANACharset(") + lines.append("\(swiftCodeIndent)preferredMIMEName: \(preferredMIMEName.map { #""\#($0)""# } ?? "nil"),") + lines.append("\(swiftCodeIndent)name: \"\(name)\",") + lines.append("\(swiftCodeIndent)aliases: [") + for alias in aliases { + lines.append("\(swiftCodeIndent)\(swiftCodeIndent)\"\(alias)\",") + } + lines.append("\(swiftCodeIndent)]") + lines.append(")") + return lines + } + + init(_ node: XMLNode) throws { + guard let name = try node.nodes(forXPath: "./name").first?.stringValue else { + throw CodeGenerationError.missingName + } + self.name = name + self.preferredMIMEName = try node.nodes(forXPath: "./preferred_alias").first?.stringValue + self.aliases = try node.nodes(forXPath: "./alias").map { + guard let alias = $0.stringValue else { + throw CodeGenerationError.missingAliasValue + } + return alias + } + } +} + +func generateSwiftCode() throws -> String { + let charsetsXMLDocument = try XMLDocument(contentsOf: charsetsXMLURL) + guard let charsetsXMLRoot = charsetsXMLDocument.rootElement() else { + throw CodeGenerationError.noRootElement + } + let charsetsXMLRecordElements = try charsetsXMLRoot.nodes(forXPath: "./registry/record") + + var result = "extension IANACharset {" + + for record in try charsetsXMLRecordElements.map({ + try IANACharsetNameRecord($0) + }) where requiredCharsetNames.contains(record.representativeName) { + result += "\n" + result += record.swiftCodeLines.map({ swiftCodeIndent + $0 }).joined(separator: "\n") + result += "\n" + } + + result += "}\n" + return result +} + +#if PRINT_CODE +print(try generateSwiftCode()) +#endif + +// MARK: - Extensions + +extension UTF8.CodeUnit { + var _isASCIINumeric: Bool { (0x30...0x39).contains(self) } + var _isASCIIUppercase: Bool { (0x41...0x5A).contains(self) } + var _isASCIILowercase: Bool { (0x61...0x7A).contains(self) } +} + +extension String { + func _camelcased() -> String { + var result = "" + var previousWord: Substring.UTF8View? = nil + for wordUTF8 in self.utf8.split(whereSeparator: { + !$0._isASCIINumeric && + !$0._isASCIIUppercase && + !$0._isASCIILowercase + }) { + defer { + previousWord = wordUTF8 + } + let word = String(Substring(wordUTF8)) + guard let previousWord else { + result += word.lowercased() + continue + } + if previousWord.last!._isASCIINumeric && wordUTF8.first!._isASCIINumeric { + result += "_" + } + if let firstNonNumericIndex = wordUTF8.firstIndex(where: { !$0._isASCIINumeric }), + wordUTF8[firstNonNumericIndex...].allSatisfy({ $0._isASCIIUppercase }) { + result += word + } else { + result += word.capitalized(with: nil) + } + + } + return result + } +} From 8f84db7ae8702407842b4325666a77b34f8a796e Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Wed, 22 Oct 2025 11:25:56 +0900 Subject: [PATCH 12/14] Simplify logic to parse IANA Charset names. In response to: - https://github.com/swiftlang/swift-foundation/pull/1286#discussion_r2441497400 - https://github.com/swiftlang/swift-foundation/pull/1286#discussion_r2441505001 - https://github.com/swiftlang/swift-foundation/pull/1286#discussion_r2441546727 --- .../String/String+Encoding+Names.swift | 135 +++--------------- 1 file changed, 17 insertions(+), 118 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 587b2eb45..2e34b3fed 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -13,126 +13,28 @@ // MARK: - Private extensions for parsing encoding names -private extension Unicode.Scalar { - /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". - /// - /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace - var _isASCIIWhitespace: Bool { - switch self.value { - case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true +private extension UTF8.CodeUnit { + func _isASCIICaseinsensitivelyEqual(to other: UTF8.CodeUnit) -> Bool { + return switch self { + case other, other._uppercased, other._lowercased: true default: false } } } private extension String { - var _trimmed: Substring.UnicodeScalarView { - let scalars = self.unicodeScalars - let isNonWhitespace: (Unicode.Scalar) -> Bool = { !$0._isASCIIWhitespace } - guard let firstIndexOfNonWhitespace = scalars.firstIndex(where: isNonWhitespace), - let lastIndexOfNonWhitespace = scalars.lastIndex(where: isNonWhitespace) else { - return Substring.UnicodeScalarView() - } - return scalars[firstIndexOfNonWhitespace...lastIndexOfNonWhitespace] - } -} - -/// A type that holds a `Unicode.Scalar` where its value is compared case-insensitively with others' -/// _if the value is within ASCII range_. -private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, - ExpressibleByUnicodeScalarLiteral { - typealias UnicodeScalarLiteralType = Unicode.Scalar.UnicodeScalarLiteralType - - let scalar: Unicode.Scalar - - init(_ scalar: Unicode.Scalar) { - assert(scalar.isASCII) - self.scalar = scalar - } - - init(unicodeScalarLiteral value: Unicode.Scalar.UnicodeScalarLiteralType) { - self.init(Unicode.Scalar(unicodeScalarLiteral: value)) - } - - static func ==( - lhs: ASCIICaseInsensitiveUnicodeScalar, - rhs: ASCIICaseInsensitiveUnicodeScalar - ) -> Bool { - if lhs.scalar == rhs.scalar { - return true - } else if ("A"..."Z").contains(lhs.scalar) { - return lhs.scalar.value + 0x20 == rhs.scalar.value - } else if ("a"..."z").contains(lhs.scalar) { - return lhs.scalar.value - 0x20 == rhs.scalar.value - } - return false - } -} - -/// A type to tokenize string for `String.Encoding` names. -internal protocol StringEncodingNameTokenizer: ~Copyable { - associatedtype Token: Equatable - init(name: String) - mutating func nextToken() throws -> Token? -} - -extension StringEncodingNameTokenizer where Self: ~Copyable { - mutating func hasEqualTokens(with other: consuming Self) throws -> Bool { - while let myToken = try self.nextToken() { - guard let otherToken = try other.nextToken(), - myToken == otherToken else { + func _isASCIICaseinsensitivelyEqual(to other: String) -> Bool { + let (myUTF8, otherUTF8) = (self.utf8, other.utf8) + var (myIndex, otherIndex) = (myUTF8.startIndex, otherUTF8.startIndex) + while myIndex < myUTF8.endIndex && otherIndex < otherUTF8.endIndex { + guard myUTF8[myIndex]._isASCIICaseinsensitivelyEqual(to: otherUTF8[otherIndex]) else { return false } - } - return try other.nextToken() == nil - } -} - - -/// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s. -private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { - typealias Token = ASCIICaseInsensitiveUnicodeScalar - enum Error: Swift.Error { - case nonASCII - } - - let scalars: Substring.UnicodeScalarView - - var _currentIndex: Substring.UnicodeScalarView.Index - - init(name: String) { - self.scalars = name._trimmed - self._currentIndex = scalars.startIndex - } - - mutating func nextToken() throws -> Token? { - guard _currentIndex < scalars.endIndex else { - return nil - } - let scalar = scalars[_currentIndex] - guard scalar.isASCII else { throw Error.nonASCII } - defer { - scalars.formIndex(after: &_currentIndex) - } - return ASCIICaseInsensitiveUnicodeScalar(scalar) - } -} - - -private extension String { - func isEqual( - to other: String, - tokenizedBy tokenizer: T.Type - ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable { - do { - var myTokenizer = T(name: self) - let otherTokenizer = T(name: other) - return try myTokenizer.hasEqualTokens(with: otherTokenizer) - } catch { - // Any errors imply that `self` or `other` contains invalid characters. - return false + myUTF8.formIndex(after: &myIndex) + otherUTF8.formIndex(after: &otherIndex) } + return myIndex == myUTF8.endIndex && otherIndex == otherUTF8.endIndex } } @@ -160,19 +62,16 @@ internal struct IANACharset { self.aliases = aliases } - func matches( - _ string: String, - tokenizedBy tokenizer: T.Type - ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable { + func matches(_ string: String) -> Bool { if let preferredMIMEName = self.preferredMIMEName, - preferredMIMEName.isEqual(to: string, tokenizedBy: tokenizer) { + preferredMIMEName._isASCIICaseinsensitivelyEqual(to: string) { return true } - if name.isEqual(to: string, tokenizedBy: tokenizer) { + if name._isASCIICaseinsensitivelyEqual(to: string) { return true } for alias in aliases { - if alias.isEqual(to: string, tokenizedBy: tokenizer) { + if alias._isASCIICaseinsensitivelyEqual(to: string) { return true } } @@ -249,7 +148,7 @@ extension String.Encoding { guard let ianaCharset = encoding._ianaCharset else { continue } - if ianaCharset.matches(charsetName, tokenizedBy: ASCIICaseInsensitiveTokenizer.self) { + if ianaCharset.matches(charsetName) { return encoding } } From d4ab876d7ad2a7d54968284887bfaaeb74650445 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Fri, 24 Oct 2025 11:27:53 +0900 Subject: [PATCH 13/14] Fix spelling of functions for "case-insensitively". In response to: - https://github.com/swiftlang/swift-foundation/pull/1286#discussion_r2457914338 --- .../String/String+Encoding+Names.swift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 2e34b3fed..84b51ec48 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -14,7 +14,7 @@ // MARK: - Private extensions for parsing encoding names private extension UTF8.CodeUnit { - func _isASCIICaseinsensitivelyEqual(to other: UTF8.CodeUnit) -> Bool { + func _isASCIICaseInsensitivelyEqual(to other: UTF8.CodeUnit) -> Bool { return switch self { case other, other._uppercased, other._lowercased: true default: false @@ -23,11 +23,11 @@ private extension UTF8.CodeUnit { } private extension String { - func _isASCIICaseinsensitivelyEqual(to other: String) -> Bool { + func _isASCIICaseInsensitivelyEqual(to other: String) -> Bool { let (myUTF8, otherUTF8) = (self.utf8, other.utf8) var (myIndex, otherIndex) = (myUTF8.startIndex, otherUTF8.startIndex) while myIndex < myUTF8.endIndex && otherIndex < otherUTF8.endIndex { - guard myUTF8[myIndex]._isASCIICaseinsensitivelyEqual(to: otherUTF8[otherIndex]) else { + guard myUTF8[myIndex]._isASCIICaseInsensitivelyEqual(to: otherUTF8[otherIndex]) else { return false } @@ -64,14 +64,14 @@ internal struct IANACharset { func matches(_ string: String) -> Bool { if let preferredMIMEName = self.preferredMIMEName, - preferredMIMEName._isASCIICaseinsensitivelyEqual(to: string) { + preferredMIMEName._isASCIICaseInsensitivelyEqual(to: string) { return true } - if name._isASCIICaseinsensitivelyEqual(to: string) { + if name._isASCIICaseInsensitivelyEqual(to: string) { return true } for alias in aliases { - if alias._isASCIICaseinsensitivelyEqual(to: string) { + if alias._isASCIICaseInsensitivelyEqual(to: string) { return true } } From cf9ed49f0ead711ed80ba469aedf55831abcb25e Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Fri, 24 Oct 2025 11:45:51 +0900 Subject: [PATCH 14/14] Remove redundant nested function in `String.Encoding(ianaName:)`. In response to: - https://github.com/swiftlang/swift-foundation/pull/1286#discussion_r2457939067 --- .../String/String+Encoding+Names.swift | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 84b51ec48..1407e5ae1 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -143,22 +143,13 @@ extension String.Encoding { .utf32LittleEndian, ] - func __determineEncoding() -> String.Encoding? { - for encoding in possibilities { - guard let ianaCharset = encoding._ianaCharset else { - continue - } - if ianaCharset.matches(charsetName) { - return encoding - } + for encoding in possibilities { + if encoding._ianaCharset!.matches(charsetName) { + self = encoding + return } - return nil } - - guard let encoding = __determineEncoding() else { - return nil - } - self = encoding + return nil } }