Find Duplicate File in System — DSA

Problem#

Given a list of strings describing directories and their files in the form "path/dir name1(content1) name2(content2) ...", return groups of file paths sharing identical content. Singleton groups are excluded.

Examples#

["root/a 1.txt(abcd) 2.txt(efgh)","root/c 3.txt(abcd) 4.txt(efgh)","root/c/d 5.txt(efgh)"] → [["root/a/1.txt","root/c/3.txt"],["root/a/2.txt","root/c/4.txt","root/c/d/5.txt"]].

Constraints#

1 <= paths.length <= 2 * 10^4, file names and contents are alphanumeric.

Hints#

Hint

Parse each entry into (path, file, content) triples, then bucket by content.

Approach#

Parse each input string: first token is the directory, the rest are name(content). Strip the content out (substring between parens) and bucket dir + "/" + name by content in a hash map. Return only buckets of size >= 2.

Solution#

class Solution {
public:
    vector<vector<string>> findDuplicate(vector<string>& paths) {
        unordered_map<string, vector<string>> buckets;
        for (auto& entry : paths) {
            int sp = entry.find(' ');
            string dir = entry.substr(0, sp);
            int i = sp + 1, n = entry.size();
            while (i < n) {
                int lp = entry.find('(', i);
                int rp = entry.find(')', lp);
                string name = entry.substr(i, lp - i);
                string content = entry.substr(lp + 1, rp - lp - 1);
                buckets[content].push_back(dir + "/" + name);
                i = rp + 2;     // skip ") "
            }
        }
        vector<vector<string>> ans;
        for (auto& [_, v] : buckets) if (v.size() > 1) ans.push_back(std::move(v));
        return ans;
    }
};

import "strings"

func findDuplicate(paths []string) [][]string {
    buckets := map[string][]string{}
    for _, entry := range paths {
        sp := strings.IndexByte(entry, ' ')
        dir := entry[:sp]
        i, n := sp+1, len(entry)
        for i < n {
            lp := strings.IndexByte(entry[i:], '(') + i
            rp := strings.IndexByte(entry[lp:], ')') + lp
            name := entry[i:lp]
            content := entry[lp+1 : rp]
            buckets[content] = append(buckets[content], dir+"/"+name)
            i = rp + 2
        }
    }
    ans := [][]string{}
    for _, v := range buckets {
        if len(v) > 1 {
            ans = append(ans, v)
        }
    }
    return ans
}

from collections import defaultdict
from typing import List

class Solution:
    def findDuplicate(self, paths: List[str]) -> List[List[str]]:
        buckets = defaultdict(list)
        for entry in paths:
            sp = entry.index(' ')
            dirpath = entry[:sp]
            i, n = sp + 1, len(entry)
            while i < n:
                lp = entry.index('(', i)
                rp = entry.index(')', lp)
                name = entry[i:lp]
                content = entry[lp + 1:rp]
                buckets[content].append(dirpath + "/" + name)
                i = rp + 2  # skip ") "
        return [v for v in buckets.values() if len(v) > 1]

function findDuplicate(paths) {
    const buckets = new Map();
    for (const entry of paths) {
        const sp = entry.indexOf(' ');
        const dir = entry.slice(0, sp);
        let i = sp + 1;
        const n = entry.length;
        while (i < n) {
            const lp = entry.indexOf('(', i);
            const rp = entry.indexOf(')', lp);
            const name = entry.slice(i, lp);
            const content = entry.slice(lp + 1, rp);
            if (!buckets.has(content)) buckets.set(content, []);
            buckets.get(content).push(dir + '/' + name);
            i = rp + 2;
        }
    }
    const ans = [];
    for (const v of buckets.values()) {
        if (v.length > 1) ans.push(v);
    }
    return ans;
}

class Solution {
    public List<List<String>> findDuplicate(String[] paths) {
        Map<String, List<String>> buckets = new HashMap<>();
        for (String entry : paths) {
            int sp = entry.indexOf(' ');
            String dir = entry.substring(0, sp);
            int i = sp + 1, n = entry.length();
            while (i < n) {
                int lp = entry.indexOf('(', i);
                int rp = entry.indexOf(')', lp);
                String name = entry.substring(i, lp);
                String content = entry.substring(lp + 1, rp);
                buckets.computeIfAbsent(content, k -> new ArrayList<>()).add(dir + "/" + name);
                i = rp + 2;
            }
        }
        List<List<String>> ans = new ArrayList<>();
        for (List<String> v : buckets.values()) {
            if (v.size() > 1) ans.add(v);
        }
        return ans;
    }
}

function findDuplicate(paths: string[]): string[][] {
    const buckets = new Map<string, string[]>();
    for (const entry of paths) {
        const sp = entry.indexOf(' ');
        const dir = entry.slice(0, sp);
        let i = sp + 1;
        const n = entry.length;
        while (i < n) {
            const lp = entry.indexOf('(', i);
            const rp = entry.indexOf(')', lp);
            const name = entry.slice(i, lp);
            const content = entry.slice(lp + 1, rp);
            if (!buckets.has(content)) buckets.set(content, []);
            buckets.get(content)!.push(`${dir}/${name}`);
            i = rp + 2;
        }
    }
    const ans: string[][] = [];
    for (const v of buckets.values()) {
        if (v.length > 1) ans.push(v);
    }
    return ans;
}

Editorial#

The hash map bucketed by content gives the answer in one pass. Parsing the awkward “path file(content) file(content) …” format is straightforward with find() and substr() — the i = rp + 2 step jumps over the closing ) and the separating space. For very large files, store a SHA hash of the content instead of the content itself.

Complexity#

Time: O(N * L).
Space: O(N * L).

Concept revision#

Group Anagrams

Problem#

Examples#

Constraints#

Hints#

Approach#

Solution#

Editorial#

Complexity#

Concept revision#

Related#