--
-- Copyright 2015-2019 Intel Corporation.
-- This software and the related documents are Intel copyrighted materials, and your use of them 
-- is governed by the express license under which they were provided to you ("License"). Unless the 
-- License provides otherwise, you may not use, modify, copy, publish, distribute, disclose or 
-- transmit this software or the related documents without Intel's prior written permission.
-- 
-- This software and the related documents are provided as is, with no express or implied warranties, 
-- other than those that are expressly stated in the License.
-- 
--


--TASK
--Find top 100 products that are sold together frequently in given
--stores. Only products in certain categories sold in specific stores are considered,
--and "sold together frequently" means at least 50 customers bought these products
--together in a transaction.

--IMPLEMENTATION NOTICE:
-- "Market basket analysis"
-- create pairs of "sold together" items within one sale (one sale == one ss_ticket_number)
-- There are several ways to do the "basketing". Way A) is the one implemented here.
-- A) collect the distinct sold items per sale (same ss_ticket_number) in a list and employ a UDTF to produce pairwise combinations of all list elements
-- B) distribute by ss_ticket_number and employ a reducer streaming script to aggregate all items per sale and produce the pairs
-- C) pure SQL: produce pairings by self-joining on ss_ticket_number and keeping only rows with left.item_sk < right.item_sk (eliminates duplicates and swapped positions)


-- Resources
-- Add the query-resources jar to the session; its directory is read from the
-- BIG_BENCH_QUERY_RESOURCES environment variable (presumably this jar contains
-- the PairwiseUDTF class registered below -- verify against the build).
ADD JAR ${env:BIG_BENCH_QUERY_RESOURCES}/bigbenchqueriesmr.jar;
-- Register the UDTF under the name makePairs for this session only (TEMPORARY).
-- The main query calls it as makePairs(<sorted item array>, false) and reads two
-- output columns (item_sk_1, item_sk_2) from it.
CREATE TEMPORARY FUNCTION makePairs AS 'io.bigdatabenchmark.v1.queries.udf.PairwiseUDTF';

--Result -------------------------------------------------------------------------
--keep result human readable
-- Turn off compression of the final query output so the result files stay plain.
set hive.exec.compress.output=false;
-- "set <key>;" without a value prints the current value of that setting,
-- which makes the effective configuration visible in the query output/log.
set hive.exec.compress.output;
--CREATE RESULT TABLE. Store query result externally in output_dir/qXXresult/
-- The table is dropped and recreated on every run, so earlier results for this
-- query are not kept. Table name and storage directory are injected through the
-- hiveconf variables RESULT_TABLE and RESULT_DIR.
-- NOTE(review): the table is created without the EXTERNAL keyword, so dropping it
-- presumably also removes the files under RESULT_DIR -- confirm this is intended.
DROP TABLE IF EXISTS ${hiveconf:RESULT_TABLE};
-- One row per item pair: the two item keys and how many tickets contained both.
CREATE TABLE ${hiveconf:RESULT_TABLE} (
  pid1 BIGINT,
  pid2 BIGINT,
  cnt  BIGINT
)
-- Comma-separated fields, one row per line; the file format itself comes from the
-- BIG_BENCH_hive_default_fileformat_result_table environment variable.
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
STORED AS ${env:BIG_BENCH_hive_default_fileformat_result_table} LOCATION '${hiveconf:RESULT_DIR}';

-- the real query part
-- Find the most frequent "sold together" item pairs and write them to the result table.
--
-- Steps (innermost subquery first):
--   1. sold_items_per_ticket: for every ticket (ss_ticket_number), the distinct set of
--      items sold on it, restricted to the configured item categories and stores.
--   2. sold_together_pairs:   every pair of items from each ticket's sorted item set.
--   3. outer query:           count how many tickets contain each pair, keep the
--                             frequent pairs and return the top q01_limit of them.
--
-- Parameters (hiveconf):
--   q01_i_category_id_IN       list of item category ids to consider
--   q01_ss_store_sk_IN         list of store keys to consider
--   q01_viewed_together_count  frequency threshold for a pair (compared with '>')
--   q01_limit                  maximum number of result rows
INSERT INTO TABLE ${hiveconf:RESULT_TABLE}
SELECT
  item_sk_1,
  item_sk_2,
  COUNT(*) AS cnt -- each ticket emits a given pair at most once (collect_set is distinct)
FROM
(
  -- Make item "sold together" pairs
  -- combining collect_set + sorting + makePairs(array, selfPairing=false)
  -- ensures we get no pairs with swapped places like: (12,24),(24,12).
  -- We only produce tuples like: (12,24) ensuring that the smaller number is always on the left side
  SELECT makePairs(sort_array(item_array), false) AS (item_sk_1, item_sk_2)
  FROM
  (
    SELECT collect_set(s.ss_item_sk) AS item_array --(_list = with duplicates, _set = distinct)
    FROM store_sales s
    INNER JOIN item i
      ON s.ss_item_sk = i.i_item_sk
    -- Only products in certain categories sold in specific stores are considered.
    WHERE i.i_category_id IN (${hiveconf:q01_i_category_id_IN})
      AND s.ss_store_sk IN (${hiveconf:q01_ss_store_sk_IN})
    GROUP BY s.ss_ticket_number
  ) sold_items_per_ticket
) sold_together_pairs
GROUP BY item_sk_1, item_sk_2
-- 'frequently'
-- NOTE(review): the comparison is strict ('>'), while the task description says
-- "at least"; left unchanged because it affects the result set -- confirm which is intended.
HAVING cnt > ${hiveconf:q01_viewed_together_count}
-- Secondary sort keys make the order (and therefore the LIMIT cut-off) deterministic on ties.
ORDER BY cnt DESC, item_sk_1, item_sk_2
LIMIT ${hiveconf:q01_limit};
