/*
 * Copyright (c) 2011, Google Inc.
 */

#define USE_THE_REPOSITORY_VARIABLE

#include "git-compat-util.h"
#include "bulk-checkin.h"
#include "environment.h"
#include "gettext.h"
#include "hex.h"
#include "lockfile.h"
#include "repository.h"
#include "csum-file.h"
#include "pack.h"
#include "strbuf.h"
#include "tmp-objdir.h"
#include "packfile.h"
#include "object-file.h"
#include "odb.h"

static int odb_transaction_nesting;

static struct tmp_objdir *bulk_fsync_objdir;

static struct bulk_checkin_packfile {
	char *pack_tmp_name;
	struct hashfile *f;
	off_t offset;
	struct pack_idx_option pack_idx_opts;

	struct pack_idx_entry **written;
	uint32_t alloc_written;
	uint32_t nr_written;
} bulk_checkin_packfile;

static void finish_tmp_packfile(struct strbuf *basename,
				const char *pack_tmp_name,
				struct pack_idx_entry **written_list,
				uint32_t nr_written,
				struct pack_idx_option *pack_idx_opts,
				unsigned char hash[])
{
	char *idx_tmp_name = NULL;

	stage_tmp_packfiles(the_repository, basename, pack_tmp_name,
			    written_list, nr_written, NULL, pack_idx_opts, hash,
			    &idx_tmp_name);
	rename_tmp_packfile_idx(basename, &idx_tmp_name);

	free(idx_tmp_name);
}

static void flush_bulk_checkin_packfile(struct bulk_checkin_packfile *state)
{
	unsigned char hash[GIT_MAX_RAWSZ];
	struct strbuf packname = STRBUF_INIT;

	if (!state->f)
		return;

	if (state->nr_written == 0) {
		close(state->f->fd);
		free_hashfile(state->f);
		unlink(state->pack_tmp_name);
		goto clear_exit;
	} else if (state->nr_written == 1) {
		finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK,
				  CSUM_HASH_IN_STREAM | CSUM_FSYNC | CSUM_CLOSE);
	} else {
		int fd = finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK, 0);
		fixup_pack_header_footer(the_hash_algo, fd, hash, state->pack_tmp_name,
					 state->nr_written, hash,
					 state->offset);
		close(fd);
	}

	strbuf_addf(&packname, "%s/pack/pack-%s.", repo_get_object_directory(the_repository),
		    hash_to_hex(hash));
	finish_tmp_packfile(&packname, state->pack_tmp_name,
			    state->written, state->nr_written,
			    &state->pack_idx_opts, hash);
	for (uint32_t i = 0; i < state->nr_written; i++)
		free(state->written[i]);

clear_exit:
	free(state->pack_tmp_name);
	free(state->written);
	memset(state, 0, sizeof(*state));

	strbuf_release(&packname);
	/* Make objects we just wrote available to ourselves */
	reprepare_packed_git(the_repository);
}

/*
 * Cleanup after batch-mode fsync_object_files.
 */
static void flush_batch_fsync(void)
{
	struct strbuf temp_path = STRBUF_INIT;
	struct tempfile *temp;

	if (!bulk_fsync_objdir)
		return;

	/*
	 * Issue a full hardware flush against a temporary file to ensure
	 * that all objects are durable before any renames occur. The code in
	 * fsync_loose_object_bulk_checkin has already issued a writeout
	 * request, but it has not flushed any writeback cache in the storage
	 * hardware or any filesystem logs. This fsync call acts as a barrier
	 * to ensure that the data in each new object file is durable before
	 * the final name is visible.
	 */
	strbuf_addf(&temp_path, "%s/bulk_fsync_XXXXXX", repo_get_object_directory(the_repository));
	temp = xmks_tempfile(temp_path.buf);
	fsync_or_die(get_tempfile_fd(temp), get_tempfile_path(temp));
	delete_tempfile(&temp);
	strbuf_release(&temp_path);

	/*
	 * Make the object files visible in the primary ODB after their data is
	 * fully durable.
	 */
	tmp_objdir_migrate(bulk_fsync_objdir);
	bulk_fsync_objdir = NULL;
}

static int already_written(struct bulk_checkin_packfile *state, struct object_id *oid)
{
	/* The object may already exist in the repository */
	if (odb_has_object(the_repository->objects, oid,
			   HAS_OBJECT_RECHECK_PACKED | HAS_OBJECT_FETCH_PROMISOR))
		return 1;

	/* Might want to keep the list sorted */
	for (uint32_t i = 0; i < state->nr_written; i++)
		if (oideq(&state->written[i]->oid, oid))
			return 1;

	/* This is a new object we need to keep */
	return 0;
}

/*
 * Read the contents from fd for size bytes, streaming it to the
 * packfile in state while updating the hash in ctx. Signal a failure
 * by returning a negative value when the resulting pack would exceed
 * the pack size limit and this is not the first object in the pack,
 * so that the caller can discard what we wrote from the current pack
 * by truncating it and opening a new one. The caller will then call
 * us again after rewinding the input fd.
 *
 * The already_hashed_to pointer is kept untouched by the caller to
 * make sure we do not hash the same byte when we are called
 * again. This way, the caller does not have to checkpoint its hash
 * status before calling us just in case we ask it to call us again
 * with a new pack.
 */
static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
			       struct git_hash_ctx *ctx, off_t *already_hashed_to,
			       int fd, size_t size, const char *path,
			       unsigned flags)
{
	git_zstream s;
	unsigned char ibuf[16384];
	unsigned char obuf[16384];
	unsigned hdrlen;
	int status = Z_OK;
	int write_object = (flags & INDEX_WRITE_OBJECT);
	off_t offset = 0;

	git_deflate_init(&s, pack_compression_level);

	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
	s.next_out = obuf + hdrlen;
	s.avail_out = sizeof(obuf) - hdrlen;

	while (status != Z_STREAM_END) {
		if (size && !s.avail_in) {
			size_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
			ssize_t read_result = read_in_full(fd, ibuf, rsize);
			if (read_result < 0)
				die_errno("failed to read from '%s'", path);
			if ((size_t)read_result != rsize)
				die("failed to read %u bytes from '%s'",
				    (unsigned)rsize, path);
			offset += rsize;
			if (*already_hashed_to < offset) {
				size_t hsize = offset - *already_hashed_to;
				if (rsize < hsize)
					hsize = rsize;
				if (hsize)
					git_hash_update(ctx, ibuf, hsize);
				*already_hashed_to = offset;
			}
			s.next_in = ibuf;
			s.avail_in = rsize;
			size -= rsize;
		}

		status = git_deflate(&s, size ? 0 : Z_FINISH);

		if (!s.avail_out || status == Z_STREAM_END) {
			if (write_object) {
				size_t written = s.next_out - obuf;

				/* would we bust the size limit? */
				if (state->nr_written &&
				    pack_size_limit_cfg &&
				    pack_size_limit_cfg < state->offset + written) {
					git_deflate_abort(&s);
					return -1;
				}

				hashwrite(state->f, obuf, written);
				state->offset += written;
			}
			s.next_out = obuf;
			s.avail_out = sizeof(obuf);
		}

		switch (status) {
		case Z_OK:
		case Z_BUF_ERROR:
		case Z_STREAM_END:
			continue;
		default:
			die("unexpected deflate failure: %d", status);
		}
	}
	git_deflate_end(&s);
	return 0;
}
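
/*
 * Illustration-only sketch (not part of the upstream file, kept out of the
 * build with "#if 0"): a condensed restatement of the retry contract
 * described above stream_blob_to_pack().  The real caller is
 * deflate_blob_to_pack() below; crc32 and pack index bookkeeping are
 * omitted here, and the helper name example_stream_with_retry() is
 * hypothetical.  A negative return means "this pack would grow past the
 * limit": the caller truncates back to its checkpoint, closes out the
 * current pack, rewinds the input and tries again, while already_hashed_to
 * ensures no byte is hashed twice across attempts.
 */
#if 0
static void example_stream_with_retry(struct bulk_checkin_packfile *state,
				      struct git_hash_ctx *ctx,
				      int fd, size_t size, const char *path)
{
	struct hashfile_checkpoint checkpoint;
	off_t already_hashed_to = 0;
	off_t seekback = lseek(fd, 0, SEEK_CUR);

	for (;;) {
		/* make sure a pack is open, remember where this attempt starts */
		prepare_to_stream(state, INDEX_WRITE_OBJECT);
		hashfile_checkpoint(state->f, &checkpoint);
		if (!stream_blob_to_pack(state, ctx, &already_hashed_to,
					 fd, size, path, INDEX_WRITE_OBJECT))
			break;	/* the blob fit into the current pack */
		/* discard this attempt, close out the pack, rewind and retry */
		hashfile_truncate(state->f, &checkpoint);
		state->offset = checkpoint.offset;
		flush_bulk_checkin_packfile(state);
		if (lseek(fd, seekback, SEEK_SET) == (off_t)-1)
			die_errno("cannot seek back");
	}
}
#endif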

/* Lazily create backing packfile for the state */
static void prepare_to_stream(struct bulk_checkin_packfile *state,
			      unsigned flags)
{
	if (!(flags & INDEX_WRITE_OBJECT) || state->f)
		return;

	state->f = create_tmp_packfile(the_repository, &state->pack_tmp_name);
	reset_pack_idx_option(&state->pack_idx_opts);

	/* Pretend we are going to write only one object */
	state->offset = write_pack_header(state->f, 1);
	if (!state->offset)
		die_errno("unable to write pack header");
}

static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
				struct object_id *result_oid,
				int fd, size_t size,
				const char *path, unsigned flags)
{
	off_t seekback, already_hashed_to;
	struct git_hash_ctx ctx;
	unsigned char obuf[16384];
	unsigned header_len;
	struct hashfile_checkpoint checkpoint;
	struct pack_idx_entry *idx = NULL;

	seekback = lseek(fd, 0, SEEK_CUR);
	if (seekback == (off_t) -1)
		return error("cannot find the current offset");

	header_len = format_object_header((char *)obuf, sizeof(obuf),
					  OBJ_BLOB, size);
	the_hash_algo->init_fn(&ctx);
	git_hash_update(&ctx, obuf, header_len);

	/* Note: idx is non-NULL when we are writing */
	if ((flags & INDEX_WRITE_OBJECT) != 0) {
		CALLOC_ARRAY(idx, 1);

		prepare_to_stream(state, flags);
		hashfile_checkpoint_init(state->f, &checkpoint);
	}

	already_hashed_to = 0;

	while (1) {
		prepare_to_stream(state, flags);
		if (idx) {
			hashfile_checkpoint(state->f, &checkpoint);
			idx->offset = state->offset;
			crc32_begin(state->f);
		}
		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
					 fd, size, path, flags))
			break;
		/*
		 * Writing this object to the current pack will make
		 * it too big; we need to truncate it, start a new
		 * pack, and write into it.
		 */
		if (!idx)
			BUG("should not happen");
		hashfile_truncate(state->f, &checkpoint);
		state->offset = checkpoint.offset;
		flush_bulk_checkin_packfile(state);
		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
			return error("cannot seek back");
	}
	git_hash_final_oid(result_oid, &ctx);
	if (!idx)
		return 0;

	idx->crc32 = crc32_end(state->f);
	if (already_written(state, result_oid)) {
		hashfile_truncate(state->f, &checkpoint);
		state->offset = checkpoint.offset;
		free(idx);
	} else {
		oidcpy(&idx->oid, result_oid);
		ALLOC_GROW(state->written,
			   state->nr_written + 1,
			   state->alloc_written);
		state->written[state->nr_written++] = idx;
	}
	return 0;
}

void prepare_loose_object_bulk_checkin(void)
{
	/*
	 * We lazily create the temporary object directory
	 * the first time an object might be added, since
	 * callers may not know whether any objects will be
	 * added at the time they call begin_odb_transaction.
	 */
	if (!odb_transaction_nesting || bulk_fsync_objdir)
		return;

	bulk_fsync_objdir = tmp_objdir_create(the_repository, "bulk-fsync");
	if (bulk_fsync_objdir)
		tmp_objdir_replace_primary_odb(bulk_fsync_objdir, 0);
}

void fsync_loose_object_bulk_checkin(int fd, const char *filename)
{
	/*
	 * If we have an active ODB transaction, we issue a call that
	 * cleans the filesystem page cache but avoids a hardware flush
	 * command. Later on we will issue a single hardware flush
	 * before renaming the objects to their final names as part of
	 * flush_batch_fsync.
	 */
	if (!bulk_fsync_objdir ||
	    git_fsync(fd, FSYNC_WRITEOUT_ONLY) < 0) {
		if (errno == ENOSYS)
			warning(_("core.fsyncMethod = batch is unsupported on this platform"));
		fsync_or_die(fd, filename);
	}
}
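
/*
 * Illustration-only sketch (not part of the upstream file, excluded from the
 * build with "#if 0") of the call sequence a loose-object writer is expected
 * to follow under core.fsyncMethod=batch: inside an ODB transaction each
 * object gets a cheap writeout-only flush, and the single hardware flush
 * plus the migration into the primary ODB happen later, in
 * flush_batch_fsync().  write_loose_contents() and
 * example_write_one_loose_object() are hypothetical names.
 */
#if 0
static void example_write_one_loose_object(const char *tmp_path)
{
	int fd;

	prepare_loose_object_bulk_checkin();		/* lazily set up the tmp objdir */
	fd = write_loose_contents(tmp_path);		/* hypothetical helper */
	fsync_loose_object_bulk_checkin(fd, tmp_path);	/* writeout only, no hard flush */
	close(fd);
	/* durability and visibility are deferred to end_odb_transaction() */
}
#endif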

int index_blob_bulk_checkin(struct object_id *oid,
			    int fd, size_t size,
			    const char *path, unsigned flags)
{
	int status = deflate_blob_to_pack(&bulk_checkin_packfile, oid, fd, size,
					  path, flags);
	if (!odb_transaction_nesting)
		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
	return status;
}

void begin_odb_transaction(void)
{
	odb_transaction_nesting += 1;
}

void flush_odb_transaction(void)
{
	flush_batch_fsync();
	flush_bulk_checkin_packfile(&bulk_checkin_packfile);
}

void end_odb_transaction(void)
{
	odb_transaction_nesting -= 1;
	if (odb_transaction_nesting < 0)
		BUG("Unbalanced ODB transaction nesting");

	if (odb_transaction_nesting)
		return;

	flush_odb_transaction();
}
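
/*
 * Usage sketch (illustration only, not part of the upstream file, excluded
 * from the build with "#if 0"): a caller adding many blobs wraps the
 * additions in a single ODB transaction so that the bulk-checkin packfile
 * and the batched fsync are flushed once at the end rather than per object.
 * The fds/sizes/paths arrays and example_bulk_add_blobs() are hypothetical.
 */
#if 0
static void example_bulk_add_blobs(int *fds, size_t *sizes,
				   const char **paths, size_t nr)
{
	struct object_id oid;

	begin_odb_transaction();
	for (size_t i = 0; i < nr; i++)
		index_blob_bulk_checkin(&oid, fds[i], sizes[i],
					paths[i], INDEX_WRITE_OBJECT);
	end_odb_transaction();	/* flushes the packfile and batched fsync */
}
#endif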