--
-- Copyright 2015-2019 Intel Corporation.
-- This software and the related documents are Intel copyrighted materials, and your use of them 
-- is governed by the express license under which they were provided to you ("License"). Unless the 
-- License provides otherwise, you may not use, modify, copy, publish, distribute, disclose or 
-- transmit this software or the related documents without Intel's prior written permission.
-- 
-- This software and the related documents are provided as is, with no express or implied warranties, 
-- other than those that are expressly stated in the License.
-- 
--


-- Perform category affinity analysis for products viewed together online.
-- Note that the order of products viewed does not matter,
-- and "viewed together" relates to a click_session of a user with a session timeout of 60min.
-- If the duration between two clicks of a user is greater then the session time-out, a new session begins.


-- IMPLEMENTATION NOTICE:
-- "Market basket analysis"
-- First difficult part is to create pairs of "viewed together" items within one sale
-- There are are several ways to to "basketing", implemented is way A)
-- A) collect distinct viewed items per session (same sales_sk) in list and employ a UDTF to produce pairwise combinations of all list elements
-- B) distribute by sales_sk and employ reducer streaming script to aggregate all items per session and produce the pairs
-- C) pure SQL: produce pairings by self joining on sales_sk and filtering out left.item_sk < right.item_sk
--
-- The second difficulty is to reconstruct a users browsing session from the web_clickstreams table
-- There are are several ways to to "sessionize", common to all is the creation of a unique virtual time stamp from the date and time serial
-- key's as we know they are both strictly monotonic increasing in order of time: (wcs_click_date_sk*24*60*60 + wcs_click_time_sk)
-- A) sessionize using SQL-windowing functions => partition by user and sort by virtual time stamp.
--    Step1: compute time difference to preceding click_session
--    Step2: compute session id per user by defining a session as: clicks no father apart then q02_session_timeout_inSec
--    Step3: make unique session identifier <user_sk>_<user_session_ID>
-- B) sessionize by clustering on user_sk and sorting by virtual time stamp then feeding the output through a external reducer script
--    which linearly iterates over the rows,  keeps track of session id's per user based on the specified session time-out and makes the unique session identifier <user_sk>_<user_seesion_ID>


-- Resources
ADD JAR ${env:BIG_BENCH_QUERY_RESOURCES}/bigbenchqueriesmr.jar;
CREATE TEMPORARY FUNCTION makePairs AS 'io.bigdatabenchmark.v1.queries.udf.PairwiseUDTF';
ADD FILE ${hiveconf:QUERY_DIR}/q30-sessionize.py;


-- SESSIONZIE by streaming
-- Step1: compute time difference to preceding click_session
-- Step2: compute session id per user by defining a session as: clicks no father appart then q30_session_timeout_inSec
-- Step3: make unique session identifier <user_sk>_<user_seesion_ID>
DROP VIEW IF EXISTS ${hiveconf:TEMP_TABLE};
CREATE VIEW ${hiveconf:TEMP_TABLE} AS
SELECT *
FROM (
  FROM (
    SELECT wcs_user_sk,
      (wcs_click_date_sk*24L*60L*60L + wcs_click_time_sk) AS tstamp_inSec,
      i_category_id
    FROM web_clickstreams wcs, item i
    WHERE wcs.wcs_item_sk = i.i_item_sk
    AND i.i_category_id IS NOT NULL
    AND wcs.wcs_user_sk IS NOT NULL
    --Hive uses the columns in Distribute By to distribute the rows among reducers. All rows with the same Distribute By columns will go to the same reducer. However, Distribute By does not guarantee clustering or sorting properties on the distributed keys.
    DISTRIBUTE BY wcs_user_sk SORT BY wcs_user_sk, tstamp_inSec, i_category_id -- "sessionize" reducer script requires the cluster by uid and sort by tstamp ASCENDING
  ) clicksAnWebPageType
  REDUCE
    wcs_user_sk,
    tstamp_inSec,
    i_category_id
  USING 'python q30-sessionize.py ${hiveconf:q30_session_timeout_inSec}'
  AS (
    category_id BIGINT,
    sessionid STRING
  )
) q02_tmp_sessionize
Cluster by sessionid
;


--Result -------------------------------------------------------------------------
--keep result human readable
set hive.exec.compress.output=false;
set hive.exec.compress.output;
--CREATE RESULT TABLE. Store query result externally in output_dir/qXXresult/
DROP TABLE IF EXISTS ${hiveconf:RESULT_TABLE};
CREATE TABLE ${hiveconf:RESULT_TABLE} (
  category_id_1 BIGINT,
  category_id_2 BIGINT,
  cnt BIGINT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
STORED AS ${env:BIG_BENCH_hive_default_fileformat_result_table} LOCATION '${hiveconf:RESULT_DIR}';


-- the real query part

INSERT INTO TABLE ${hiveconf:RESULT_TABLE}
SELECT  category_id_1, category_id_2, COUNT (*) AS cnt
FROM (
  -- Make category "viewed together" pairs
  -- combining collect_set + sorting + makepairs(array, noSelfParing)
  -- ensures we get no pairs with swapped places like: (12,24),(24,12).
  -- We only produce tuples (12,24) ensuring that the smaller number is always on the left side
  SELECT makePairs(sort_array(itemArray), false) AS (category_id_1,category_id_2)
  FROM (
    SELECT collect_set(category_id) as itemArray --(_list= with duplicates, _set = distinct)
    FROM ${hiveconf:TEMP_TABLE}
    GROUP BY sessionid
  ) collectedList
) pairs
GROUP BY category_id_1, category_id_2
ORDER BY cnt DESC, category_id_1, category_id_2
LIMIT ${hiveconf:q30_limit};

-- cleanup
DROP VIEW IF EXISTS ${hiveconf:TEMP_TABLE};
