V0516 11:47:27.901000 139733182882816 torch/_logging/structured.py:19] {"str": ["/home/jjwu/tmp.py", 0]}
V0516 11:47:27.901000 139733182882816 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/nn/modules/module.py", 1]}
V0516 11:47:27.901000 139733182882816 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/eval_frame.py", 2]}
V0516 11:47:27.901000 139733182882816 torch/_dynamo/convert_frame.py:792] {"dynamo_start": {"stack": [{"line": 15, "name": "<module>", "filename": 0}, {"line": 1532, "name": "_wrapped_call_impl", "filename": 1}, {"line": 1541, "name": "_call_impl", "filename": 1}, {"line": 414, "name": "_fn", "filename": 2}, {"line": 1532, "name": "_wrapped_call_impl", "filename": 1}, {"line": 1541, "name": "_call_impl", "filename": 1}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0516 11:47:27.920000 139733182882816 torch/_dynamo/output_graph.py:1278] {"dynamo_output_graph": {"sizes": {"l_x_": [2], "g_global_state_tensor_": [2], "l__self___param": [2], "y": [2], "add": [2], "add_1": [2]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "2fcc93b1b49f2b51f2f307ab841be828"}
class GraphModule(torch.nn.Module):
def forward(self, L_x_: "f32[2][1]cpu", G_global_state_tensor_: "f32[2][1]cpu"):
l_x_ = L_x_
g_global_state_tensor_ = G_global_state_tensor_
# File: /home/jjwu/tmp.py:8 in forward, code: y = torch.sin(self.param)
l__self___param: "f32[2][1]cpu" = self.L__self___param
y: "f32[2][1]cpu" = torch.sin(l__self___param); l__self___param = None
# File: /home/jjwu/tmp.py:9 in forward, code: return y+ x + global_state_tensor
add: "f32[2][1]cpu" = y + l_x_; y = l_x_ = None
add_1: "f32[2][1]cpu" = add + g_global_state_tensor_; add = g_global_state_tensor_ = None
return (add_1,)
V0516 11:47:27.930000 139733182882816 torch/_functorch/aot_autograd.py:887] {"link": {"name": "manifold_url", "url": "https://www.google.com"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0516 11:47:27.945000 139733182882816 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:191] {"aot_forward_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "1beb7268ed1533c825d47635b0110b80"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[2]", arg1_1: "f32[2]", arg2_1: "f32[2]"):
# File: /home/jjwu/tmp.py:8 in forward, code: y = torch.sin(self.param)
sin: "f32[2]" = torch.ops.aten.sin.default(arg0_1); arg0_1 = None
# File: /home/jjwu/tmp.py:9 in forward, code: return y+ x + global_state_tensor
add: "f32[2]" = torch.ops.aten.add.Tensor(sin, arg1_1); sin = arg1_1 = None
add_1: "f32[2]" = torch.ops.aten.add.Tensor(add, arg2_1); add = arg2_1 = None
return (add_1,)
V0516 11:47:29.040000 139733182882816 torch/_inductor/compile_fx.py:742] {"inductor_post_grad_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "1beb7268ed1533c825d47635b0110b80"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[2]", arg1_1: "f32[2]", arg2_1: "f32[2]"):
# File: /home/jjwu/tmp.py:8 in forward, code: y = torch.sin(self.param)
sin: "f32[2]" = torch.ops.aten.sin.default(arg0_1); arg0_1 = None
# File: /home/jjwu/tmp.py:9 in forward, code: return y+ x + global_state_tensor
add: "f32[2]" = torch.ops.aten.add.Tensor(sin, arg1_1); sin = arg1_1 = None
add_1: "f32[2]" = torch.ops.aten.add.Tensor(add, arg2_1); add = arg2_1 = None
return (add_1,)
V0516 11:47:33.646000 139733182882816 torch/_inductor/graph.py:1697] {"inductor_output_code": {"filename": "/tmp/torchinductor_jjwu/a5/ca5os4o7g4qiox3d7on73q5rz47pg6mywe35z7mwmsaohs3ev3cy.py"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "0e0f93ffb20f4bd34dec99c06a121936"}
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_add_sin_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_jjwu/tc/ctcib3vzwwy5ojjrjpuj6kvvjcgr5r6aayijaxwpdmvn4amuedlx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0)
{
{
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp2 = in_ptr1[static_cast<long>(x0)];
auto tmp4 = in_ptr2[static_cast<long>(x0)];
auto tmp1 = std::sin(tmp0);
auto tmp3 = decltype(tmp1)(tmp1 + tmp2);
auto tmp5 = decltype(tmp3)(tmp3 + tmp4);
out_ptr0[static_cast<long>(x0)] = tmp5;
}
}
}
''')
async_compile.wait(globals())
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1 = args
args.clear()
assert_size_stride(arg0_1, (2, ), (1, ))
assert_size_stride(arg1_1, (2, ), (1, ))
assert_size_stride(arg2_1, (2, ), (1, ))
buf0 = empty_strided_cpu((2, ), (1, ), torch.float32)
cpp_fused_add_sin_0(arg0_1, arg1_1, arg2_1, buf0)
del arg0_1
del arg1_1
del arg2_1
return (buf0, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((2, ), (1, ), device='cpu', dtype=torch.float32)
arg1_1 = rand_strided((2, ), (1, ), device='cpu', dtype=torch.float32)
arg2_1 = rand_strided((2, ), (1, ), device='cpu', dtype=torch.float32)
fn = lambda: call([arg0_1, arg1_1, arg2_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('None', benchmark_compiled_module)
V0516 11:47:33.651000 139733182882816 torch/_dynamo/guards.py:2304] {"dynamo_guards": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0516 11:47:33.652000 139733182882816 torch/_dynamo/guards.py:2132] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "8c785311c5ebd407bea97cb8f1eacf06"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:456 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['x'], accessed_by=DictGetItemGuardAccessor(x)
| | +- TENSOR_MATCH: check_tensor(L['x'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU), torch.float32, device=None, requires_grad=False, size=[2], stride=[1])
| | +- NO_HASATTR: hasattr(L['x'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['x'], G['global_state_tensor'])
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139733177812832)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7665376)
| | | +- GuardManager: source=L['self']._parameters, accessed_by=DictGetItemGuardAccessor(_parameters)
| | | | +- GuardManager: source=L['self'].param, accessed_by=DictGetItemGuardAccessor(param)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].param, 139730993771552)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139733176988704)
| | | +- GuardManager: source=G['torch'].sin, accessed_by=GetAttrGuardAccessor(sin)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].sin, 139733173319728)
| | +- GuardManager: source=G['global_state_tensor'], accessed_by=DictGetItemGuardAccessor(global_state_tensor)
| | | +- TENSOR_MATCH: check_tensor(G['global_state_tensor'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU), torch.float32, device=None, requires_grad=False, size=[2], stride=[1])
| | | +- NO_HASATTR: hasattr(G['global_state_tensor'], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['x'], G['global_state_tensor'])
V0516 11:47:33.652000 139733182882816 torch/_dynamo/utils.py:634] {"compilation_metrics": {"frame_key": "1", "co_name": "forward", "co_filename": "/home/jjwu/tmp.py", "co_firstlineno": 7, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 11, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 2, "start_time": 1715885247.9018195, "entire_frame_compile_time_s": 5.750433683395386, "backend_compile_time_s": 5.728264808654785, "inductor_compile_time_s": 4.610030651092529, "code_gen_time_s": 4.601039886474609, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}