epoxy 源码笔记1：Diffing 算法

· 2023-03-16 · # epoxy # ios

最近开始阅读学习 epoxy 的源码，开个新的系列来记录一下学习成功。

本系列文章不对 epoxy 是什么进行讲解，直接讲解其中的部分源码内容。

如果相比较文字，您更喜欢直接阅读代码+注释，那么您可以跳到文章末尾的总结一节，该小节包含了一个完整的，带注释讲解的源码。

源码

本文主要讲解 Collection+Diff.swift 文件中的 makeChangeset(from:) 方法，该方法可以快速比较出两个集合之间的区别，同时返回增删改的索引集合。

首先贴一下相关的源码：

// MARK: - Collection
extension Collection where Element: Diffable, Index == Int {

  /// Diffs two collections (e.g. `Array`s) of `Diffable` items, returning an `IndexChangeset`
  /// representing the minimal set of changes to get from the other collection to this collection.
  ///
  /// - Parameters:
  ///     - from other: The collection of old data.
  public func makeChangeset(from other: Self) -> IndexChangeset {
    // Arranging the elements contiguously prior to diffing improves performance by ~40%.
    let new = ContiguousArray(self)
    let old = ContiguousArray(other)

    /// The entries in both this and the other collection, keyed by their `dataID`s.
    var entries = [AnyHashable: Entry](minimumCapacity: new.count)
    var duplicates = [Entry]()

    var newResults = ContiguousArray<NewRecord>()
    newResults.reserveCapacity(new.count)

    for index in new.indices {
      let id = new[index].diffIdentifier
      let entry = entries[id, default: Entry()]
      if entry.trackNewIndex(index) {
        duplicates.append(entry)
      }
      entries[id] = entry
      newResults.append(NewRecord(entry: entry))
    }

    var oldResults = ContiguousArray<OldRecord>()
    oldResults.reserveCapacity(old.count)

    for index in old.indices {
      let id = old[index].diffIdentifier
      let entry = entries[id]
      entry?.pushOldIndex(index)
      oldResults.append(OldRecord(entry: entry))
    }

    for newIndex in new.indices {
      let entry = newResults[newIndex].entry
      if let oldIndex = entry.popOldIndex() {
        let newItem = new[newIndex]
        let oldItem = other[oldIndex]

        if !oldItem.isDiffableItemEqual(to: newItem) {
          entry.isUpdated = true
        }

        newResults[newIndex].correspondingOldIndex = oldIndex
        oldResults[oldIndex].correspondingNewIndex = newIndex
      }
    }

    var deletes = [Int]()
    var deleteOffsets = [Int]()
    deleteOffsets.reserveCapacity(old.count)
    var runningDeleteOffset = 0

    for index in old.indices {
      deleteOffsets.append(runningDeleteOffset)

      let record = oldResults[index]

      if record.correspondingNewIndex == nil {
        deletes.append(index)
        runningDeleteOffset += 1
      }
    }

    var inserts = [Int]()
    var updates = [(Int, Int)]()
    var moves = [(Int, Int)]()
    var insertOffsets = [Int]()
    insertOffsets.reserveCapacity(new.count)
    var runningInsertOffset = 0

    for index in new.indices {
      insertOffsets.append(runningInsertOffset)

      let record = newResults[index]

      if let oldArrayIndex = record.correspondingOldIndex {
        if record.entry.isUpdated {
          updates.append((oldArrayIndex, index))
        }

        let insertOffset = insertOffsets[index]
        let deleteOffset = deleteOffsets[oldArrayIndex]
        if (oldArrayIndex - deleteOffset + insertOffset) != index {
          moves.append((oldArrayIndex, index))
        }

      } else {
        inserts.append(index)
        runningInsertOffset += 1
      }
    }

    EpoxyLogger.shared.assert(
      old.count + inserts.count - deletes.count == new.count,
      "Failed sanity check for old count with changes matching new count.")

    return IndexChangeset(
      inserts: inserts,
      deletes: deletes,
      updates: updates,
      moves: moves,
      newIndices: oldResults.map { $0.correspondingNewIndex },
      duplicates: duplicates.map { $0.newIndices })
  }

// MARK: - Entry
/// A bookkeeping refrence type for the diffing algorithm.
private final class Entry {

  // MARK: Internal
  private(set) var oldIndices = [Int]()
  private(set) var newIndices = [Int]()
  var isUpdated = false

  /// Tracks an index from the new indices, returning `true` if this entry has previously tracked
  /// a new index as a means to identify duplicates and `false` otherwise.
  func trackNewIndex(_ index: Int) -> Bool {
    let previouslyEmpty = newIndices.isEmpty

    newIndices.append(index)

    // We've encountered a duplicate, return true so we can track it.
    if !previouslyEmpty, newIndices.count == 2 {
      return true
    }

    return false
  }

  func pushOldIndex(_ index: Int) {
    oldIndices.append(index)
  }

  func popOldIndex() -> Int? {
    guard currentOldIndex < oldIndices.endIndex else {
      return nil
    }
    defer {
      currentOldIndex += 1
    }
    return oldIndices[currentOldIndex]
  }

  // MARK: Private
  private var currentOldIndex = 0
}

// MARK: - OldRecord
/// A bookkeeping type for pairing up an old element with its new index.
private struct OldRecord {
  var entry: Entry?
  var correspondingNewIndex: Int? = nil
}

// MARK: - NewRecord
/// A bookkeeping type for pairing up a new element with its old index.
private struct NewRecord {
  var entry: Entry
  var correspondingOldIndex: Int? = nil
}

makeChangeset(from:) 方法是 Collection<Diffable> 的一个扩展，实际使用时你可以简单的理解为一个 [Diffable]。方法接收一个同类型的集合，返回一个 IndexChangeset 类型的结构体。

返回值暂且忽略，咱们直接来看函数体。

`ContiguousArray`

let new = ContiguousArray(self)
let old = ContiguousArray(other)

函数体内首先将 self 和 other 转换为 ContiguousArray 类型的数组。从变量名上我们可以得知，self 代表着新的集合，而 other 代表着旧的的集合，也就是被比较的集合。

ContiguousArray 和 Array 类似，区别在于 ContiguousArray 能保证子元素在内存中是连续存储的，也就是说它是一块连续的内存，这就允许 CPU 通过指针进行连续访问，大幅提升性能。而 Array 我们大家都知道，它的内部使用了 “缓冲区”，逻辑上是连续的，但是内存上未必是连续的。

因为在后续的算法中，self 和 other 的大小是固定的，不会发生变化，所以这里将其转换为 ContiguousArray 是更好的选择。

通过源码中的注释我们也能知道，将 self 和 other 转换为 ContiguousArray 类型，可以提升大约 40% 的性能。

后文中，将使用 new 代表 self，使用 old 代表 other。和这里的变量声明保持同步。

预填充数组

毕竟是一个讲究效率的算法，后文中随处可见 .init(minimumCapacity:) 、 reserveCapacity(_:) 方法，这些方法的作用是一样的：设置数组的长度。

我们都知道 Swift 数组扩容通常是直接 *2。上一节中我们有提到，整个算法中数据源的大小是已知的，那么作为一个讲究效率的算法，我们可以在创建数组的同时就设定好数组的大小，避免在后续操作中触发数组的自动扩容，减少因为扩容而带来的性能损耗。

`entries`

var entries = [AnyHashable: Entry](minimumCapacity: new.count)

entries 这个字典变量的作用是，将 new 和 old 中，使用相同 id 的元素索引值对应起来。

现在这么讲可能比较抽象，后面结合逻辑一起来谈。

记录新元素

var newResults = ContiguousArray<NewRecord>()
newResults.reserveCapacity(new.count)

for index in new.indices {
  let id = new[index].diffIdentifier
  let entry = entries[id, default: Entry()]
  if entry.trackNewIndex(index) {
    duplicates.append(entry)
  }
  entries[id] = entry
  newResults.append(NewRecord(entry: entry))
}

代码首先使用新序列的索引进行遍历，使用对应元素的 id，尝试去 entries 中查找元素。

这里使用了一个小技巧：Dictionay 的 default。当不存在 id 对应的元素时，将返回 default 对应的内容。这里即是返回一个空的 Entry 对象。

稍后讲解 Entry 对象和 trackNewIndex(_:) 方法的具体内容，先大致说一下：

Entry 对象用来记录 “拥有同一个 id 的两个元素，其在新、旧两个数组中的位置”。
trackNewIndex(_:) 方法接收索引作为参数，并存储到 Entry 对象中。同时该方法会返回这个 Entry 对象是否已经在之前出现过（相当于一个重复检查）。

现在我们就可以知道这个 for 循环大致的含义和作用了：

它遍历新数组，首先根据 id 将数组元素存储到 entries 对象中，其次对该数组做一个 “重复检查”，并记录下重复的次数。

`Entry` 对象和 `trackNewIndex(_:)` 方法

Entry 对象是一个结构体，它的完整声明比较长，这里我就不再重复贴了，读者可以翻到最上面自行查看。

我们先把目光集中在 newIndices 这个属性以及 trackNewIndex(_:) 方法上。

trackNewIndex(_:) 方法的实现为：

/// Tracks an index from the new indices, returning `true` if this entry has previously tracked a new index as a means to identify duplicates and `false` otherwise.
func trackNewIndex(_ index: Int) -> Bool {
  let previouslyEmpty = newIndices.isEmpty

  newIndices.append(index)

  // We've encountered a duplicate, return true so we can track it.
  if !previouslyEmpty, newIndices.count == 2 {
    return true
  }

  return false
}

该方法将参数存储到 newIndices 里。这是它的作用之一：记录该 id 在新数组（new）中的位置。

然后判断，如果 newIndices 中的元素等于 2，则意味着出现了重复。这很好理解，newIndices 大于 1 意味着同一个 id 出现了两次。不过截止到发文我也没能理解，为什么是 == 2 而不是 >= 2。结合后文推断，猜测是不希望 duplicates 中出现重复的元素。

记录旧元素

var oldResults = ContiguousArray<OldRecord>()
oldResults.reserveCapacity(old.count)

for index in old.indices {
  let id = old[index].diffIdentifier
  let entry = entries[id]
  entry?.pushOldIndex(index)
  oldResults.append(OldRecord(entry: entry))
}

记录旧元素的逻辑比记录新元素的逻辑要简单一些：当我们通过 id，如果在 entries 映射表中查到对应元素时（意味着该 id 存在对应的新元素），则存储该元素在旧数据中的索引值。否则不进行存储（因为 entries 的意思是存储 “新-旧” 的对应关系，不存在新，那么旧就没有意义）。

`NewRecord` & `OldRecord`

在遍历新集合和旧集合的循环中，末尾都是创建了一个对应的 XXRecord 对象。这两个类型的声明比较相似，除了存储对应的 Entry 之外，还带有一个 Int? 类型的 correspondingXXXIndex 属性，该属性默认是 nil。

correspondingOldIndex 意味着在新集合中，该元素在旧集合中的位置。
correspondingNewIndex 意味着在旧集合中，该元素在新集合中的位置。

如果是 nil，则意味着该元素在另外的集合中不存在。

这两个属性都在后面的算法中起到了重要的作用。

确定元素变动

for newIndex in new.indices {
  let entry = newResults[newIndex].entry
  if let oldIndex = entry.popOldIndex() {
    let newItem = new[newIndex]
    let oldItem = other[oldIndex]
    
    if !oldItem.isDiffableItemEqual(to: newItem) {
      entry.isUpdated = true
    }
    
    newResults[newIndex].correspondingOldIndex = oldIndex
    oldResults[oldIndex].correspondingNewIndex = newIndex
  }
}

这个对新集合的遍历，确定了两个集合之间元素的变动。

首先是对新集合的遍历，因为 newResults 可以说就是 new 通过 map 得到的，所以可以使用 newIndex 直接从 newResults 中进行取值，而不用担心是否会越界。

你应该还记得，entry 对象中存储着该 id 对应的元素在新、老集合中对应的索引。如果在新、老集合中不存在，那么对应的索引也就不存在。所以通过 popOldIndex() 方法（通过方法名也能看出它的作用），可以通过 entry 对象判断是否存在旧的元素。

到这一步我们应该能知道：如果 newResults 中的某一个元素的 correspondingOldIndex 是 nil，那么该元素就是新增的。那么反过来，oldResults 中的逻辑也是一样的。

实际上上一节中，关于 correspondingOldIndex 和 correspondingNewIndex 这两个属性的作用，也是我通过这个循环推断出来的。

接着往下看。如果存在旧的元素，那么我们就要比较新旧两个元素是否一致，如果不一致，则通过 entry.isUpdated 记录该元素发生了更新。

最后在 newResults 和 oldResults 中记录该元素在另外一个数组中的位置。

中场休息

到了这里，其实我们可以小结一下，先总结一下 newResults、oldResults 和 entries 三个变量的作用。

entries 的主要就是将新旧两个数组，通过相同的 id 做一个匹配，同时记录相应的索引。匹配到一起之后用来创建 newResults 和 oldResults 对象。该变量主要作用于算法的前半部分，为创建 newResults 和 oldResults 而生，创建完之后它的任务就结束了。

newResults 和 oldResults 这两个变量最主要的作用，按我的理解就是：“通过一个Int 类型的可选值，记录了 entry 的前世今生”。

correspondingXXXIndex 的存在有两个含义：

是否为空意味着有前世，或者有今生
不只是有，还能拿到对应的位置

感慨一下如果没有 “可选值” 这个概念，则需要借助 Bool + Int 两个变量来完成这个工作。或者判断一下 nil？

删除、新增与移动

在得到 newResults 和 oldResults 两个对象后，要计算出两个集合之间的差异就比较简单了。

删除

后面的算法首先处理的是删除操作。

var deletes = [Int]()
var deleteOffsets = [Int]()
deleteOffsets.reserveCapacity(old.count)
var runningDeleteOffset = 0

for index in old.indices {
    deleteOffsets.append(runningDeleteOffset)
    let record = oldResults[index]
    if record.correspondingNewIndex == nil {
        deletes.append(index)
        runningDeleteOffset += 1
    }
}

其实上文已经做了解释：如果 correspondingNewIndex 是 nil，则意味着该元素是被删除的元素。所以这次是对 old 进行遍历，而不是遍历 new 。

除了将被删除的元素记录到 deletes 之外，该循环还将 “在这个位置之前，有多少个被删除了多少个元素” 记录到了 deleteOffsets 数组中。

例如 [1, 2, 3, 4, 5, 6, 7] 和 [2, 3, 5, 7] 这两个集合做比较，那么 deleteOffsets 的值就是 [0, 1, 1, 1, 2, 2, 3]。代表着 “5这个元素之前，有两个元素被删除掉了”。

该算法是先将 runningDeleteOffset 添加到 deleteOffsets 里，再做递增操作。所以记录的是 “本元素之前，被删除了多少个元素”。

新增和移动

var inserts = [Int]()
var updates = [(Int, Int)]()
var moves = [(Int, Int)]()
var insertOffsets = [Int]()
insertOffsets.reserveCapacity(new.count)
var runningInsertOffset = 0

for index in new.indices {
  insertOffsets.append(runningInsertOffset)
  
  let record = newResults[index]
  
  if let oldArrayIndex = record.correspondingOldIndex {
    if record.entry.isUpdated {
      updates.append((oldArrayIndex, index))
    }
    
    let insertOffset = insertOffsets[index]
    let deleteOffset = deleteOffsets[oldArrayIndex]
    if (oldArrayIndex - deleteOffset + insertOffset) != index {
      moves.append((oldArrayIndex, index))
    }
    
  } else {
    inserts.append(index)
    runningInsertOffset += 1
  }
}

首先我们发现遍历的开头，用了 record.correspondingOldIndex 来做判断，意味着 “该元素有没有对应的老元素”。

然后从下往上看 else 的逻辑，和删除的时候一样，这次循环通过 inserts 记录了 “该元素之前，插入了多少个元素”。这里我就不用具体的例子举例了，大家应该可以明白。

那么再回到 if 里的逻辑，首先 isUpdated 是之前已经判断过了的。

那么 (oldArrayIndex - deleteOffset + insertOffset) != index 该怎么理解呢？

首先我们明确每个参数的含义：

index 是某个元素在 “新集合” 中的位置。
oldArrayIndex 是该元素在 “旧集合” 中的位置。
deleteOffset 意味着该元素之前删除了几个元素。
insertOffset 意味着该元素之前添加了几个元素。

那么我们就可以理解了：对于旧集合而言，如果去掉被删除的，再加上新增的，如果索引位置不变，那么该元素的位置没有发生移动。

通过具体的例子来看一下：

[1, 2, 3, 4, 5, 6, 7] 和 [2, 3, 7, 5]。我们遍历的是后者，即新集合。deleteOffsets 是 [0, 1, 1, 1, 2, 2, 3]。

假设此时我们遍历到了新集合中的 3。

那么 index 是 1，oldArrayIndex 是 2，deleteOffset 是 1，insertOffset 是 0。表达式为 2 - 1 - 0 == 1，不符合条件（注意代码中用的是 != 号），那么则不会添加到 moves 里，意味着 3 这个元素没有被移动。

再假设此时我们遍历到了新集合中的 7。

那么 index 是 2，oldArrayIndex 是 6，deleteOffset 是 3，insertOffset 是 0。表达式为 6 - 3 - 0 != 2，符合条件，所以判断 7 发生了移动。

最后遍历到 5 的时候，

index 是 3，oldArrayIndex 是 4，deleteOffset 是 2，insertOffset 是 0。表达式为 4 - 2 - 0 != 3，符合条件，所以判5 也是发生了移动。

这样，我们就可以筛选出哪些元素发生了移动。

总结

epoxy 的 Diffing 算法大体可以分为一下几个部分：

将输入转换为 ContiguousArray 类型，提升效率。
使用哈希表，将同一个 id 的元素从新旧两个集合中摘出来，做匹配，同时记录原本的位置。
利用这个哈希表，配合 LCS 算法，找出哪些元素被删除，哪些元素是新增的。
根据需求，进一步将第三步的结果进行处理。

最后贴上一下我自己对源代码做的注释：

/// 对两个 `Diffable` 元素集合（例如 `Array`）进行差异化处理，返回一个 `IndexChangeset`，表示从 `other` 到此集合的最小更改集合。
///
/// - Parameters:
///     - from other: 旧数据集合。
public func makeChangeset(from other: Self) -> IndexChangeset {
    // 在进行差异化处理之前将元素连续排列可以提高大约40%的性能。
    let new = ContiguousArray(self)
    let old = ContiguousArray(other)
    
    // 通过 `dataID`，将 `self` 和 `other` 的对应元素存储到 `entries` 里。
    var entries = [AnyHashable: Entry](minimumCapacity: new.count)
    var duplicates = [Entry]()
    
    // 用来记录新元素（`self`）和旧索引的一个对应关系
    var newResults = ContiguousArray<NewRecord>()
    newResults.reserveCapacity(new.count)
    
    //
    for index in new.indices {
        // `new` 其实就是 `self`，从 `self` 里拿标识符
        let id = new[index].diffIdentifier
        
        let entry = entries[id, default: Entry()]
        
        // 1. 记录这个新元素的位置
        // 2. 判断当前数组的重复性，因为是数组，所以内部数据有可能会有重复的情况。
        if entry.trackNewIndex(index) {
            duplicates.append(entry)
        }
        
        entries[id] = entry
        
        // `correspondingOldIndex` 默认是 `nil`
        newResults.append(NewRecord(entry: entry))
    }
    
    // 用来记录旧元素（`other`）和新索引的一个对应关系
    var oldResults = ContiguousArray<OldRecord>()
    oldResults.reserveCapacity(old.count)
    
    for index in old.indices {
        // 从 `other` 里拿标识符
        let id = old[index].diffIdentifier
        
        // 用 `other` 的 `id`，去 `self` 里找对应的值
        let entry = entries[id]
        
        // 如果找到的话，就将旧的索引存储到 `entry` 里。
        // `entry` 里既存储了新（`self`）的索引，又存储了旧（`other`）的索引。
        // 因为 `id` 是一样的，所以就通过 `id` 把新旧两个元素的索引对应起来了。
        entry?.pushOldIndex(index)
        
        // `correspondingNewIndex` 默认是 `nil`
        oldResults.append(OldRecord(entry: entry))
    }
    
    for newIndex in new.indices {
        // `newResults` 是通过遍历 new 得到的
        // 所以 `index` 是一致的，可以直接用 `new` 的 `index` 拿到 `newResults` 的元素
        let entry = newResults[newIndex].entry
        
        // 如果存在新元素对应的老元素的话，可能对应更新操作
        if let oldIndex = entry.popOldIndex() {
            let newItem = new[newIndex]
            let oldItem = other[oldIndex]
            
            // 如果两个对象不相等，则是更新操作
            if !oldItem.isDiffableItemEqual(to: newItem) {
                entry.isUpdated = true
            }
            
            // 记录两个元素相对的位置
            // 可能从 `old` 更新到 `new`，所以索引是有需要的
            newResults[newIndex].correspondingOldIndex = oldIndex
            oldResults[oldIndex].correspondingNewIndex = newIndex
        }
    }
    
    // 删除操作
    var deletes = [Int]()
    var deleteOffsets = [Int]()
    deleteOffsets.reserveCapacity(old.count)
    var runningDeleteOffset = 0
    
    for index in old.indices {
        deleteOffsets.append(runningDeleteOffset)
        
        let record = oldResults[index]
        
        // 这里等于 `nil`，意味着在上一个循环中，没有匹配到对应的 `new` 元素
        // 所以意味着在 `new` 数组中，没有该元素存在，所以该元素被删除了
        if record.correspondingNewIndex == nil {
            deletes.append(index)
            runningDeleteOffset += 1
        }
    }
    
    var inserts = [Int]()
    var updates = [(Int, Int)]()
    var moves = [(Int, Int)]()
    var insertOffsets = [Int]()
    insertOffsets.reserveCapacity(new.count)
    var runningInsertOffset = 0
    
    for index in new.indices {
        insertOffsets.append(runningInsertOffset)
        
        let record = newResults[index]
        
        // 有新，有老
        if let oldArrayIndex = record.correspondingOldIndex {
            if record.entry.isUpdated {
                // 更新，记录元素的对应关系
                updates.append((oldArrayIndex, index))
            }
            
            // `insertOffsets` 和 `deleteOffsets` 里的值，类似于 `[0, 0, 1, 1, 1]` 这种形式
            // 其长度分别等于 `new` 和 `old`，所以直接使用对应的 `index` 取值不会导致溢出
            let insertOffset = insertOffsets[index]
            let deleteOffset = deleteOffsets[oldArrayIndex]
            
            // 旧位置 - 删除了的元素个数 + 新增了的元素个数 != 新位置 -> 移动了
            if (oldArrayIndex - deleteOffset + insertOffset) != index {
                moves.append((oldArrayIndex, index))
            }
        }
        
        // 有新，没老
        else {
            // 意味着新的元素相对于老的元素来说是插入进来的
            inserts.append(index)
            runningInsertOffset += 1
        }
    }
    
    EpoxyLogger.shared.assert(
        old.count + inserts.count - deletes.count == new.count,
        "Failed sanity check for old count with changes matching new count.")
    
    return IndexChangeset(
        inserts: inserts,
        deletes: deletes,
        updates: updates,
        moves: moves,
        newIndices: oldResults.map { $0.correspondingNewIndex },
        duplicates: duplicates.map { $0.newIndices })
}

后记

文章算是写完了，但是感慨自己虽然花了很多时间，算是 “弄明白” 了本算法，但是也仅限于看懂，然后感慨它的精妙。甚至不敢肯定以后遇到相似的需求时，能用上该算法。哎，希望今后还能有所长进。