19 | 19 | import host_tools.drive as drive_tools |
20 | 20 | import host_tools.network as net_tools |
21 | 21 | from framework import utils |
| 22 | +from framework.microvm import SnapshotType |
22 | 23 | from framework.properties import global_props |
23 | | -from framework.utils import check_filesystem, check_output |
| 24 | +from framework.utils import Timeout, check_filesystem, check_output |
24 | 25 | from framework.utils_vsock import ( |
25 | 26 | ECHO_SERVER_PORT, |
26 | 27 | VSOCK_UDS_PATH, |
@@ -583,3 +584,119 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): |
583 | 584 | rename_interfaces={iface_override.dev_name: iface_override.tap_name}, |
584 | 585 | resume=True, |
585 | 586 | ) |
| 587 | + |
| 588 | + |
| 589 | +@pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) |
| 590 | +@pytest.mark.parametrize("pci_enabled", [False]) |
| 591 | +def test_snapshot_with_heavy_async_io( |
| 592 | + microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled |
| 593 | +): |
| 594 | + """ |
| 595 | + Test that snapshot/restore works correctly when there are in-flight async I/O |
| 596 | + operations. |
| 597 | + """ |
| 598 | + |
| 599 | + def _parse_pending_ops_from_log(vm): |
| 600 | + """Parse VM log to find pending_ops during drain. Returns 0 if not found.""" |
| 601 | + if not vm.log_file or not vm.log_file.exists(): |
| 602 | + return 0 |
| 603 | + try: |
| 604 | + for line in vm.log_data.splitlines(): |
| 605 | + if "drain_and_flush draining:" in line: |
| 606 | + match = re.search(r"pending_ops=(\d+)", line) |
| 607 | + if match: |
| 608 | + print(line) |
| 609 | + return int(match.group(1)) |
| 610 | + except (OSError, ValueError) as err: |
| 611 | + print(f"ERROR: Failed to parse log file: {err}") |
| 612 | + return 0 |
| 613 | + |
| 614 | + max_attempts = 30 |
| 615 | + |
| 616 | + for attempt in range(max_attempts): |
| 617 | + print("=" * 80) |
| 618 | + print( |
| 619 | + f"Attempt {attempt + 1}/{max_attempts} - Testing for non-zero async I/O drain" |
| 620 | + ) |
| 621 | + print("=" * 80) |
| 622 | + |
| 623 | + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs_rw, pci=pci_enabled) |
| 624 | + vm.spawn(log_level="Debug", log_show_level=True, log_show_origin=True) |
| 625 | + vm.basic_config( |
| 626 | + vcpu_count=2, |
| 627 | + mem_size_mib=1024, |
| 628 | + rootfs_io_engine="Async", |
| 629 | + ) |
| 630 | + vm.add_net_iface() |
| 631 | + vm.start() |
| 632 | + |
| 633 | + write_io_script = """ |
| 634 | + mkdir -p /root/io_test |
| 635 | + cd /root/io_test |
 | 636 | + |
| 637 | + for i in $(seq 1 2000); do |
| 638 | + dd if=/dev/urandom of=/root/io_test/test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & |
| 639 | + done |
 | 640 | + |
| 641 | + for i in $(seq 1 1000); do |
| 642 | + dd if=/dev/urandom of=/root/io_test/medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & |
| 643 | + done |
 | 644 | + |
| 645 | + for i in $(seq 1 500); do |
| 646 | + dd if=/dev/urandom of=/root/io_test/large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & |
| 647 | + done |
 | 648 | + |
| 649 | + for i in $(seq 1 200); do |
| 650 | + dd if=/dev/urandom of=/root/io_test/xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & |
| 651 | + done |
 | 652 | + |
| 653 | + fio --name=heavy_write --filename=/root/io_test/fio_write_test \ |
| 654 | + --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ |
| 655 | + --iodepth=512 --direct=1 --runtime=30 --time_based \ |
| 656 | + --numjobs=2 --group_reporting >/dev/null 2>&1 & |
 | 657 | + |
| 658 | + fio --name=heavy_write2 --filename=/root/io_test/fio_write_test2 \ |
| 659 | + --rw=randwrite --bs=8k --size=2G --ioengine=libaio \ |
| 660 | + --iodepth=512 --direct=1 --runtime=30 --time_based \ |
| 661 | + --numjobs=2 --group_reporting >/dev/null 2>&1 & |
 | 662 | + |
| 663 | + fio --name=heavy_write3 --filename=/root/io_test/fio_write_test3 \ |
| 664 | + --rw=randwrite --bs=16k --size=2G --ioengine=libaio \ |
| 665 | + --iodepth=512 --direct=1 --runtime=30 --time_based \ |
| 666 | + --numjobs=2 --group_reporting >/dev/null 2>&1 & |
 | 667 | + |
| 668 | + fio --name=heavy_write4 --filename=/root/io_test/fio_write_test4 \ |
| 669 | + --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ |
| 670 | + --iodepth=512 --direct=1 --runtime=30 --time_based \ |
| 671 | + --numjobs=2 --group_reporting >/dev/null 2>&1 & |
| 672 | + """ |
| 673 | + |
| 674 | + vm.ssh.run(write_io_script) |
| 675 | + |
| 676 | + # Very short sleep to maximize chance of catching in-flight I/O |
| 677 | + time.sleep(0.01) |
| 678 | + |
| 679 | + snapshot = vm.make_snapshot(snapshot_type) |
| 680 | + pending_ops_during_drain = _parse_pending_ops_from_log(vm) |
| 681 | + vm.kill() |
| 682 | + |
| 683 | + if pending_ops_during_drain == 0: |
| 684 | + print("pending_ops=0, retrying...") |
| 685 | + # Clean up only when we're not going to use the snapshot |
| 686 | + vm.jailer.cleanup() |
| 687 | + chroot = vm.jailer.chroot_base_with_id() |
| 688 | + if chroot.exists(): |
| 689 | + shutil.rmtree(chroot) |
| 690 | + continue |
| 691 | + |
| 692 | + # We caught in-flight I/O - now test restore |
| 693 | + print(f"Caught {pending_ops_during_drain} pending ops, testing restore...") |
| 694 | + with Timeout(30): |
| 695 | + restored_vm = microvm_factory.build_from_snapshot(snapshot) |
| 696 | + |
| 697 | + # Verify VM is responsive |
| 698 | + restored_vm.ssh.check_output("true") |
| 699 | + print(f"SUCCESS: VM restored with {pending_ops_during_drain} pending async ops") |
| 700 | + return |
| 701 | + |
| 702 | + pytest.skip(f"Could not catch in-flight async I/O after {max_attempts} attempts") |
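
Note on the log parsing in `_parse_pending_ops_from_log`: the helper only looks for a line containing the `drain_and_flush draining:` marker and extracts the `pending_ops=<n>` field from it. As a standalone sketch of that matching, with a hypothetical sample line (only the marker and the field are taken from the test code; the surrounding log format is an assumption, not the VMM's exact output):

```python
import re

# Hypothetical log line: only "drain_and_flush draining:" and "pending_ops=<n>"
# are assumed from the test code; the rest of the format is made up.
sample = (
    "2024-01-01T00:00:00.000 [fc:main] DEBUG "
    "drain_and_flush draining: pending_ops=17"
)

if "drain_and_flush draining:" in sample:
    match = re.search(r"pending_ops=(\d+)", sample)
    pending_ops = int(match.group(1)) if match else 0
    print(pending_ops)  # prints 17
```

A value of 0 (no marker, no match, or an unreadable log) makes the test retry with a fresh VM, which is why the helper treats every failure mode as "nothing caught" rather than raising.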
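The `with Timeout(30):` guard bounds how long the restore may take, so a restore stuck behind un-drained async I/O fails the test promptly instead of hanging the run. The framework helper's implementation is not part of this diff; as a rough illustration of the pattern only, a hypothetical SIGALRM-based stand-in (an assumption, not `framework.utils.Timeout` itself) could look like this:

```python
import signal
from contextlib import contextmanager

@contextmanager
def timeout(seconds):
    """Hypothetical stand-in for framework.utils.Timeout: raise if the body of
    the `with` block runs longer than `seconds` (main thread only)."""

    def _on_alarm(signum, frame):
        raise TimeoutError(f"operation exceeded {seconds}s")

    previous = signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, previous)
```

Whatever the real helper does internally, the design point is the same: the restore path is the part of the test most likely to deadlock if pending async operations were not drained correctly, so it is the part that gets an explicit time bound.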