[bitbake-devel] [OE-core][PATCH v7 3/3] sstate: Implement hash equivalence sstate
Jacob Kroon
jacob.kroon at gmail.com
Tue Jan 8 06:29:37 UTC 2019
On 1/4/19 5:20 PM, Joshua Watt wrote:
> Converts sstate so that it can use a hash equivalence server to
> determine if a task really needs to be rebuilt, or if it can be restored
> from a different (equivalent) sstate object.
>
> The unique hashes are cached persistently using persist_data. This has
> a number of advantages:
> 1) Unique hashes can be cached between invocations of bitbake to
> prevent needing to contact the server every time (which is slow)
> 2) The value of each task's unique hash can easily be synchronized
> between different threads, which will be useful if bitbake is
> updated to do on the fly task re-hashing.
>
> [YOCTO #13030]
>
> Signed-off-by: Joshua Watt <JPEWhacker at gmail.com>
> ---
> meta/classes/sstate.bbclass | 105 +++++++++++++++++++++--
> meta/conf/bitbake.conf | 4 +-
> meta/lib/oe/sstatesig.py | 167 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 267 insertions(+), 9 deletions(-)
>
> diff --git a/meta/classes/sstate.bbclass b/meta/classes/sstate.bbclass
> index 59ebc3ab5cc..da0807d6e99 100644
> --- a/meta/classes/sstate.bbclass
> +++ b/meta/classes/sstate.bbclass
> @@ -11,7 +11,7 @@ def generate_sstatefn(spec, hash, d):
> SSTATE_PKGARCH = "${PACKAGE_ARCH}"
> SSTATE_PKGSPEC = "sstate:${PN}:${PACKAGE_ARCH}${TARGET_VENDOR}-${TARGET_OS}:${PV}:${PR}:${SSTATE_PKGARCH}:${SSTATE_VERSION}:"
> SSTATE_SWSPEC = "sstate:${PN}::${PV}:${PR}::${SSTATE_VERSION}:"
> -SSTATE_PKGNAME = "${SSTATE_EXTRAPATH}${@generate_sstatefn(d.getVar('SSTATE_PKGSPEC'), d.getVar('BB_TASKHASH'), d)}"
> +SSTATE_PKGNAME = "${SSTATE_EXTRAPATH}${@generate_sstatefn(d.getVar('SSTATE_PKGSPEC'), d.getVar('BB_UNIHASH'), d)}"
> SSTATE_PKG = "${SSTATE_DIR}/${SSTATE_PKGNAME}"
> SSTATE_EXTRAPATH = ""
> SSTATE_EXTRAPATHWILDCARD = ""
> @@ -82,6 +82,23 @@ SSTATE_SIG_PASSPHRASE ?= ""
> # Whether to verify the GnUPG signatures when extracting sstate archives
> SSTATE_VERIFY_SIG ?= "0"
>
> +SSTATE_HASHEQUIV_METHOD ?= "OEOuthashBasic"
> +SSTATE_HASHEQUIV_METHOD[doc] = "The function used to calculate the output hash \
> + for a task, which in turn is used to determine equivalency. \
> + "
> +
> +SSTATE_HASHEQUIV_SERVER ?= ""
> +SSTATE_HASHEQUIV_SERVER[doc] = "The hash equivalence server. For example, \
> + 'http://192.168.0.1:5000'. Do not include a trailing slash \
> + "
> +
> +SSTATE_HASHEQUIV_REPORT_TASKDATA ?= "0"
> +SSTATE_HASHEQUIV_REPORT_TASKDATA[doc] = "Report additional useful data to the \
> + hash equivalency server, such as PN, PV, taskname, etc. This information \
> + is very useful for developers looking at task data, but may leak sensitive \
> + data if the equivalence server is public. \
> + "
> +
> python () {
> if bb.data.inherits_class('native', d):
> d.setVar('SSTATE_PKGARCH', d.getVar('BUILD_ARCH', False))
> @@ -640,7 +657,7 @@ def sstate_package(ss, d):
> return
>
> for f in (d.getVar('SSTATECREATEFUNCS') or '').split() + \
> - ['sstate_create_package', 'sstate_sign_package'] + \
> + ['sstate_report_unihash', 'sstate_create_package', 'sstate_sign_package'] + \
> (d.getVar('SSTATEPOSTCREATEFUNCS') or '').split():
> # All hooks should run in SSTATE_BUILDDIR.
> bb.build.exec_func(f, d, (sstatebuild,))
> @@ -764,6 +781,73 @@ python sstate_sign_package () {
> d.getVar('SSTATE_SIG_PASSPHRASE'), armor=False)
> }
>
> +def OEOuthashBasic(path, sigfile, task, d):
> + import hashlib
> + import stat
> +
> + def update_hash(s):
> + s = s.encode('utf-8')
> + h.update(s)
> + if sigfile:
> + sigfile.write(s)
> +
> + h = hashlib.sha256()
> + prev_dir = os.getcwd()
> +
> + try:
> + os.chdir(path)
> +
> + update_hash("OEOuthashBasic\n")
> +
> + # It is only currently useful to get equivalent hashes for things that
> + # can be restored from sstate. Since the sstate object is named using
> + # SSTATE_PKGSPEC and the task name, those should be included in the
> + # output hash calculation.
> + update_hash("SSTATE_PKGSPEC=%s\n" % d.getVar('SSTATE_PKGSPEC'))
> + update_hash("task=%s\n" % task)
> +
> + for root, dirs, files in os.walk('.', topdown=True):
> + # Sort directories and files to ensure consistent ordering
> + dirs.sort()
> + files.sort()
> +
> + for f in files:
> + path = os.path.join(root, f)
> + s = os.lstat(path)
> +
> + # Hash file path
> + update_hash(path + '\n')
> +
> + # Hash file mode
> + update_hash("\tmode=0x%x\n" % stat.S_IMODE(s.st_mode))
> + update_hash("\ttype=0x%x\n" % stat.S_IFMT(s.st_mode))
> +
> + if stat.S_ISBLK(s.st_mode) or stat.S_ISCHR(s.st_mode):
> + # Hash device major and minor
> + update_hash("\tdev=%d,%d\n" % (os.major(s.st_rdev), os.minor(s.st_rdev)))
> + elif stat.S_ISLNK(s.st_mode):
> + # Hash symbolic link
> + update_hash("\tsymlink=%s\n" % os.readlink(path))
> + else:
> + fh = hashlib.sha256()
> + # Hash file contents
> + with open(path, 'rb') as infile:
> + for chunk in iter(lambda: infile.read(4096), b""):
> + fh.update(chunk)
> + update_hash("\tdigest=%s\n" % fh.hexdigest())
Would it be a good idea to make the depsig.do_* files even more human
readable, considering that they could be candidates for being stored in
buildhistory ?
As an example, here's what buildhistory/.../files-in-package.txt for
busybox looks like:
drwxr-xr-x root root 4096 ./bin
lrwxrwxrwx root root 14 ./bin/busybox -> busybox.nosuid
-rwxr-xr-x root root 547292 ./bin/busybox.nosuid
-rwsr-xr-x root root 50860 ./bin/busybox.suid
lrwxrwxrwx root root 14 ./bin/sh -> busybox.nosuid
drwxr-xr-x root root 4096 ./etc
-rw-r--r-- root root 2339 ./etc/busybox.links.nosuid
-rw-r--r-- root root 91 ./etc/busybox.links.suid
> + finally:
> + os.chdir(prev_dir)
> +
> + return h.hexdigest()
> +
> +python sstate_report_unihash() {
> + report_unihash = getattr(bb.parse.siggen, 'report_unihash', None)
> +
> + if report_unihash:
> + ss = sstate_state_fromvars(d)
> + report_unihash(os.getcwd(), ss['task'], d)
> +}
> +
> #
> # Shell function to decompress and prepare a package for installation
> # Will be run from within SSTATE_INSTDIR.
> @@ -788,6 +872,11 @@ def sstate_checkhashes(sq_fn, sq_task, sq_hash, sq_hashfn, d, siginfo=False, *,
> if siginfo:
> extension = extension + ".siginfo"
>
> + def gethash(task):
> + if sq_unihash is not None:
> + return sq_unihash[task]
> + return sq_hash[task]
> +
> def getpathcomponents(task, d):
> # Magic data from BB_HASHFILENAME
> splithashfn = sq_hashfn[task].split(" ")
> @@ -810,7 +899,7 @@ def sstate_checkhashes(sq_fn, sq_task, sq_hash, sq_hashfn, d, siginfo=False, *,
>
> spec, extrapath, tname = getpathcomponents(task, d)
>
> - sstatefile = d.expand("${SSTATE_DIR}/" + extrapath + generate_sstatefn(spec, sq_hash[task], d) + "_" + tname + extension)
> + sstatefile = d.expand("${SSTATE_DIR}/" + extrapath + generate_sstatefn(spec, gethash(task), d) + "_" + tname + extension)
>
> if os.path.exists(sstatefile):
> bb.debug(2, "SState: Found valid sstate file %s" % sstatefile)
> @@ -872,7 +961,7 @@ def sstate_checkhashes(sq_fn, sq_task, sq_hash, sq_hashfn, d, siginfo=False, *,
> if task in ret:
> continue
> spec, extrapath, tname = getpathcomponents(task, d)
> - sstatefile = d.expand(extrapath + generate_sstatefn(spec, sq_hash[task], d) + "_" + tname + extension)
> + sstatefile = d.expand(extrapath + generate_sstatefn(spec, gethash(task), d) + "_" + tname + extension)
> tasklist.append((task, sstatefile))
>
> if tasklist:
> @@ -898,12 +987,12 @@ def sstate_checkhashes(sq_fn, sq_task, sq_hash, sq_hashfn, d, siginfo=False, *,
> evdata = {'missed': [], 'found': []};
> for task in missed:
> spec, extrapath, tname = getpathcomponents(task, d)
> - sstatefile = d.expand(extrapath + generate_sstatefn(spec, sq_hash[task], d) + "_" + tname + ".tgz")
> - evdata['missed'].append( (sq_fn[task], sq_task[task], sq_hash[task], sstatefile ) )
> + sstatefile = d.expand(extrapath + generate_sstatefn(spec, gethash(task), d) + "_" + tname + ".tgz")
> + evdata['missed'].append( (sq_fn[task], sq_task[task], gethash(task), sstatefile ) )
> for task in ret:
> spec, extrapath, tname = getpathcomponents(task, d)
> - sstatefile = d.expand(extrapath + generate_sstatefn(spec, sq_hash[task], d) + "_" + tname + ".tgz")
> - evdata['found'].append( (sq_fn[task], sq_task[task], sq_hash[task], sstatefile ) )
> + sstatefile = d.expand(extrapath + generate_sstatefn(spec, gethash(task), d) + "_" + tname + ".tgz")
> + evdata['found'].append( (sq_fn[task], sq_task[task], gethash(task), sstatefile ) )
> bb.event.fire(bb.event.MetadataEvent("MissedSstate", evdata), d)
>
> # Print some summary statistics about the current task completion and how much sstate
> diff --git a/meta/conf/bitbake.conf b/meta/conf/bitbake.conf
> index 64800623545..e64ce6a6dab 100644
> --- a/meta/conf/bitbake.conf
> +++ b/meta/conf/bitbake.conf
> @@ -867,7 +867,9 @@ BB_HASHBASE_WHITELIST ?= "TMPDIR FILE PATH PWD BB_TASKHASH BBPATH BBSERVER DL_DI
> STAMPS_DIR PRSERV_DUMPDIR PRSERV_DUMPFILE PRSERV_LOCKDOWN PARALLEL_MAKE \
> CCACHE_DIR EXTERNAL_TOOLCHAIN CCACHE CCACHE_NOHASHDIR LICENSE_PATH SDKPKGSUFFIX \
> WARN_QA ERROR_QA WORKDIR STAMPCLEAN PKGDATA_DIR BUILD_ARCH SSTATE_PKGARCH \
> - BB_WORKERCONTEXT BB_LIMITEDDEPS extend_recipe_sysroot DEPLOY_DIR"
> + BB_WORKERCONTEXT BB_LIMITEDDEPS BB_UNIHASH extend_recipe_sysroot DEPLOY_DIR \
> + SSTATE_HASHEQUIV_METHOD SSTATE_HASHEQUIV_SERVER SSTATE_HASHEQUIV_REPORT_TASKDATA \
> + SSTATE_HASHEQUIV_OWNER"
> BB_HASHCONFIG_WHITELIST ?= "${BB_HASHBASE_WHITELIST} DATE TIME SSH_AGENT_PID \
> SSH_AUTH_SOCK PSEUDO_BUILD BB_ENV_EXTRAWHITE DISABLE_SANITY_CHECKS \
> PARALLEL_MAKE BB_NUMBER_THREADS BB_ORIGENV BB_INVALIDCONF BBINCLUDED \
> diff --git a/meta/lib/oe/sstatesig.py b/meta/lib/oe/sstatesig.py
> index 18c5a353a2a..059e165c7ab 100644
> --- a/meta/lib/oe/sstatesig.py
> +++ b/meta/lib/oe/sstatesig.py
> @@ -263,10 +263,177 @@ class SignatureGeneratorOEBasicHash(bb.siggen.SignatureGeneratorBasicHash):
> if error_msgs:
> bb.fatal("\n".join(error_msgs))
>
> +class SignatureGeneratorOEEquivHash(SignatureGeneratorOEBasicHash):
> + name = "OEEquivHash"
> +
> + def init_rundepcheck(self, data):
> + super().init_rundepcheck(data)
> + self.server = data.getVar('SSTATE_HASHEQUIV_SERVER')
> + self.method = data.getVar('SSTATE_HASHEQUIV_METHOD')
> + self.unihashes = bb.persist_data.persist('SSTATESIG_UNIHASH_CACHE_v1_' + self.method, data)
> +
> + def get_taskdata(self):
> + return (self.server, self.method) + super().get_taskdata()
> +
> + def set_taskdata(self, data):
> + self.server, self.method = data[:2]
> + super().set_taskdata(data[2:])
> +
> + def __get_task_unihash_key(self, task):
> + # TODO: The key only *needs* to be the taskhash, the task is just
> + # convenient
> + return '%s:%s' % (task, self.taskhash[task])
> +
> + def get_stampfile_hash(self, task):
> + if task in self.taskhash:
> + # If a unique hash is reported, use it as the stampfile hash. This
> + # ensures that a task won't be re-run if the taskhash changes,
> + # but it would result in the same output hash
> + unihash = self.unihashes.get(self.__get_task_unihash_key(task))
> + if unihash is not None:
> + return unihash
> +
> + return super().get_stampfile_hash(task)
> +
> + def get_unihash(self, task):
> + import urllib
> + import json
> +
> + taskhash = self.taskhash[task]
> +
> + key = self.__get_task_unihash_key(task)
> +
> + # TODO: This cache can grow unbounded. It probably only needs to keep
> + # the most recent unique hash for each task
> + unihash = self.unihashes.get(key)
> + if unihash is not None:
> + return unihash
> +
> + # In the absence of being able to discover a unique hash from the
> + # server, make it be equivalent to the taskhash. The unique "hash" only
> + # really needs to be a unique string (not even necessarily a hash), but
> + # making it match the taskhash has a few advantages:
> + #
> + # 1) All of the sstate code that assumes hashes can be the same
> + # 2) It provides maximal compatibility with builders that don't use
> + # an equivalency server
> + # 3) The value is easy for multiple independent builders to derive the
> + # same unique hash from the same input. This means that if the
> + # independent builders find the same taskhash, but it isn't reported
> + # to the server, there is a better chance that they will agree on
> + # the unique hash.
> + unihash = taskhash
> +
> + try:
> + url = '%s/v1/equivalent?%s' % (self.server,
> + urllib.parse.urlencode({'method': self.method, 'taskhash': self.taskhash[task]}))
> +
> + request = urllib.request.Request(url)
> + response = urllib.request.urlopen(request)
> + data = response.read().decode('utf-8')
> +
> + json_data = json.loads(data)
> +
> + if json_data:
> + unihash = json_data['unihash']
> + # A unique hash equal to the taskhash is not very interesting,
> + # so it is reported at debug level 2. If they differ, that
> + # is much more interesting, so it is reported at debug level 1
> + bb.debug((1, 2)[unihash == taskhash], 'Found unihash %s in place of %s for %s from %s' % (unihash, taskhash, task, self.server))
> + else:
> + bb.debug(2, 'No reported unihash for %s:%s from %s' % (task, taskhash, self.server))
> + except urllib.error.URLError as e:
> + bb.warn('Failure contacting Hash Equivalence Server %s: %s' % (self.server, str(e)))
> + except (KeyError, json.JSONDecodeError) as e:
> + bb.warn('Poorly formatted response from %s: %s' % (self.server, str(e)))
> +
> + self.unihashes[key] = unihash
> + return unihash
> +
> + def report_unihash(self, path, task, d):
> + import urllib
> + import json
> + import tempfile
> + import base64
> +
> + taskhash = d.getVar('BB_TASKHASH')
> + unihash = d.getVar('BB_UNIHASH')
> + report_taskdata = d.getVar('SSTATE_HASHEQUIV_REPORT_TASKDATA') == '1'
> + tempdir = d.getVar('T')
> + fn = d.getVar('BB_FILENAME')
> + key = fn + '.do_' + task + ':' + taskhash
> +
> + # Sanity checks
> + cache_unihash = self.unihashes.get(key)
> + if cache_unihash is None:
> + bb.fatal('%s not in unihash cache. Please report this error' % key)
> +
> + if cache_unihash != unihash:
> + bb.fatal("Cache unihash %s doesn't match BB_UNIHASH %s" % (cache_unihash, unihash))
> +
> + sigfile = None
> + sigfile_name = "depsig.do_%s.%d" % (task, os.getpid())
> + sigfile_link = "depsig.do_%s" % task
> +
> + try:
> + call = self.method + '(path, sigfile, task, d)'
> + sigfile = open(os.path.join(tempdir, sigfile_name), 'w+b')
> + locs = {'path': path, 'sigfile': sigfile, 'task': task, 'd': d}
> +
> + outhash = bb.utils.better_eval(call, locs)
> +
> + try:
> + url = '%s/v1/equivalent' % self.server
> + task_data = {
> + 'taskhash': taskhash,
> + 'method': self.method,
> + 'outhash': outhash,
> + 'unihash': unihash,
> + 'owner': d.getVar('SSTATE_HASHEQUIV_OWNER')
> + }
> +
> + if report_taskdata:
> + sigfile.seek(0)
> +
> + task_data['PN'] = d.getVar('PN')
> + task_data['PV'] = d.getVar('PV')
> + task_data['PR'] = d.getVar('PR')
> + task_data['task'] = task
> + task_data['outhash_siginfo'] = sigfile.read().decode('utf-8')
> +
> + headers = {'content-type': 'application/json'}
> +
> + request = urllib.request.Request(url, json.dumps(task_data).encode('utf-8'), headers)
> + response = urllib.request.urlopen(request)
> + data = response.read().decode('utf-8')
> +
> + json_data = json.loads(data)
> + new_unihash = json_data['unihash']
> +
> + if new_unihash != unihash:
> + bb.debug(1, 'Task %s unihash changed %s -> %s by server %s' % (taskhash, unihash, new_unihash, self.server))
> + else:
> + bb.debug(1, 'Reported task %s as unihash %s to %s' % (taskhash, unihash, self.server))
> + except urllib.error.URLError as e:
> + bb.warn('Failure contacting Hash Equivalence Server %s: %s' % (self.server, str(e)))
> + except (KeyError, json.JSONDecodeError) as e:
> + bb.warn('Poorly formatted response from %s: %s' % (self.server, str(e)))
> + finally:
> + if sigfile:
> + sigfile.close()
> +
> + sigfile_link_path = os.path.join(tempdir, sigfile_link)
> + bb.utils.remove(sigfile_link_path)
> +
> + try:
> + os.symlink(sigfile_name, sigfile_link_path)
> + except OSError:
> + pass
>
> # Insert these classes into siggen's namespace so it can see and select them
> bb.siggen.SignatureGeneratorOEBasic = SignatureGeneratorOEBasic
> bb.siggen.SignatureGeneratorOEBasicHash = SignatureGeneratorOEBasicHash
> +bb.siggen.SignatureGeneratorOEEquivHash = SignatureGeneratorOEEquivHash
>
>
> def find_siginfo(pn, taskname, taskhashlist, d):
>
More information about the bitbake-devel
mailing list